diff --git a/0001-Initial-public-Mesa-SWR.patch b/0001-Initial-public-Mesa-SWR.patch deleted file mode 100644 index 528678e..0000000 --- a/0001-Initial-public-Mesa-SWR.patch +++ /dev/null @@ -1,6428 +0,0 @@ -From 293435cf5955935a6ce43bf59a6d743aad8be6d8 Mon Sep 17 00:00:00 2001 -From: Tim Rowley -Date: Mon, 19 Oct 2015 13:31:29 -0500 -Subject: [PATCH 1/3] Initial public Mesa+SWR - ---- - README.md | 33 + - configure.ac | 54 + - src/gallium/Makefile.am | 4 + - src/gallium/SConscript | 1 + - src/gallium/auxiliary/gallivm/lp_bld_flow.h | 7 + - src/gallium/auxiliary/gallivm/lp_bld_init.h | 7 + - src/gallium/auxiliary/gallivm/lp_bld_sample.h | 6 + - src/gallium/auxiliary/gallivm/lp_bld_tgsi.h | 8 + - .../auxiliary/target-helpers/inline_sw_helper.h | 13 +- - .../target-helpers/inline_wrapper_sw_helper.h | 2 +- - src/gallium/drivers/swr/.clang-format | 64 + - src/gallium/drivers/swr/Automake.inc | 28 + - src/gallium/drivers/swr/Makefile.am | 82 ++ - src/gallium/drivers/swr/Makefile.sources | 114 ++ - src/gallium/drivers/swr/SConscript | 69 + - src/gallium/drivers/swr/swr_clear.cpp | 141 ++ - src/gallium/drivers/swr/swr_context.cpp | 392 ++++++ - src/gallium/drivers/swr/swr_context.h | 172 +++ - src/gallium/drivers/swr/swr_context_llvm.h | 124 ++ - src/gallium/drivers/swr/swr_draw.cpp | 277 ++++ - src/gallium/drivers/swr/swr_fence.cpp | 141 ++ - src/gallium/drivers/swr/swr_fence.h | 73 ++ - src/gallium/drivers/swr/swr_memory.h | 99 ++ - src/gallium/drivers/swr/swr_public.h | 40 + - src/gallium/drivers/swr/swr_query.cpp | 334 +++++ - src/gallium/drivers/swr/swr_query.h | 48 + - src/gallium/drivers/swr/swr_resource.h | 98 ++ - src/gallium/drivers/swr/swr_scratch.cpp | 116 ++ - src/gallium/drivers/swr/swr_scratch.h | 63 + - src/gallium/drivers/swr/swr_screen.cpp | 666 ++++++++++ - src/gallium/drivers/swr/swr_screen.h | 52 + - src/gallium/drivers/swr/swr_shader.cpp | 608 +++++++++ - src/gallium/drivers/swr/swr_shader.h | 61 + - src/gallium/drivers/swr/swr_state.cpp | 1344 ++++++++++++++++++++ - src/gallium/drivers/swr/swr_state.h | 240 ++++ - src/gallium/drivers/swr/swr_tex_sample.cpp | 338 +++++ - src/gallium/drivers/swr/swr_tex_sample.h | 47 + - src/gallium/targets/libgl-xlib/Makefile.am | 5 + - src/gallium/targets/libgl-xlib/SConscript | 4 + - src/gallium/targets/osmesa/Makefile.am | 6 + - 40 files changed, 5979 insertions(+), 2 deletions(-) - create mode 100644 README.md - create mode 100644 src/gallium/drivers/swr/.clang-format - create mode 100644 src/gallium/drivers/swr/Automake.inc - create mode 100644 src/gallium/drivers/swr/Makefile.am - create mode 100644 src/gallium/drivers/swr/Makefile.sources - create mode 100644 src/gallium/drivers/swr/SConscript - create mode 100644 src/gallium/drivers/swr/swr_clear.cpp - create mode 100644 src/gallium/drivers/swr/swr_context.cpp - create mode 100644 src/gallium/drivers/swr/swr_context.h - create mode 100644 src/gallium/drivers/swr/swr_context_llvm.h - create mode 100644 src/gallium/drivers/swr/swr_draw.cpp - create mode 100644 src/gallium/drivers/swr/swr_fence.cpp - create mode 100644 src/gallium/drivers/swr/swr_fence.h - create mode 100644 src/gallium/drivers/swr/swr_memory.h - create mode 100644 src/gallium/drivers/swr/swr_public.h - create mode 100644 src/gallium/drivers/swr/swr_query.cpp - create mode 100644 src/gallium/drivers/swr/swr_query.h - create mode 100644 src/gallium/drivers/swr/swr_resource.h - create mode 100644 src/gallium/drivers/swr/swr_scratch.cpp - create mode 100644 src/gallium/drivers/swr/swr_scratch.h - create mode 100644 
src/gallium/drivers/swr/swr_screen.cpp - create mode 100644 src/gallium/drivers/swr/swr_screen.h - create mode 100644 src/gallium/drivers/swr/swr_shader.cpp - create mode 100644 src/gallium/drivers/swr/swr_shader.h - create mode 100644 src/gallium/drivers/swr/swr_state.cpp - create mode 100644 src/gallium/drivers/swr/swr_state.h - create mode 100644 src/gallium/drivers/swr/swr_tex_sample.cpp - create mode 100644 src/gallium/drivers/swr/swr_tex_sample.h - -diff --git a/README.md b/README.md -new file mode 100644 -index 0000000..3bf3031 ---- /dev/null -+++ b/README.md -@@ -0,0 +1,33 @@ -+OpenSWR-Mesa -+============ -+ -+Overview -+-------- -+ -+This is repository of the integration work combining the high -+performance, highly scalable core SWR rasterizer with Mesa. A more -+complete introduction and discussion towards upstreaming to the Mesa -+project can be found on the mesa-dev mailing list. -+ -+Notes -+----- -+ -+* SWR is set as the default software renderer. Use -+GALLIUM_DRIVER=llvmpipe to switch to Mesa's standard rasterizer. This -+particular change is to make it easier for people evaluating OpenSWR, -+and will not be upstreamed. -+ -+* LLVM-3.6 is required. -+ -+* To build SWR with autoconf, include the following in the config -+line: "--with-gallium-drivers=swr --enable-swr-native". -+ -+* Build defaults to AVX2; for a version to run on AVX build with -+ "--with-swr-arch=AVX". -+ -+* To build SWR with SCons, nothing needs to be done - it is built by -+ default. -+ -+* Code for the driver is in src/gallium/drivers/swr -+ -+* Code for the rasterizer is in src/gallium/drivers/swr/rasterizer -diff --git a/configure.ac b/configure.ac -index d3df195..f216dc7 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -1753,6 +1753,11 @@ AC_SUBST([LLVM_LIBS]) - AC_SUBST([LLVM_LDFLAGS]) - AC_SUBST([LLVM_INCLUDEDIR]) - AC_SUBST([LLVM_VERSION]) -+AC_SUBST([SWR_LIBDIR]) -+AC_SUBST([SWR_ARCH]) -+AC_SUBST([SWR_ARCH_FLAG]) -+AC_SUBST([SWR_NATIVE]) -+AC_SUBST([SWR_INCLUDEDIR]) - AC_SUBST([CLANG_RESOURCE_DIR]) - - case "x$enable_opengl$enable_gles1$enable_gles2" in -@@ -2177,6 +2182,9 @@ if test -n "$with_gallium_drivers"; then - HAVE_GALLIUM_LLVMPIPE=yes - fi - ;; -+ xswr) -+ HAVE_GALLIUM_SWR=yes -+ ;; - xvc4) - HAVE_GALLIUM_VC4=yes - gallium_require_drm "vc4" -@@ -2243,6 +2251,41 @@ if test "x$MESA_LLVM" != x0; then - fi - fi - -+dnl SWR include/library -+ -+AC_ARG_WITH([swr-includedir], -+ [AS_HELP_STRING([--with-swr-includedir], [Path to SWR includes])], -+ [SWR_INCLUDEDIR="$withval"], -+ [SWR_INCLUDEDIR='']) -+ -+AC_ARG_WITH([swr-libdir], -+ [AS_HELP_STRING([--with-swr-libdir], [Path to SWR library])], -+ [SWR_LIBDIR="$withval"], -+ [SWR_LIBDIR='']) -+ -+AC_ARG_WITH([swr-arch], -+ [AS_HELP_STRING([--with-swr-arch], [AVX architecture for swr (AVX | CORE_AVX2) ])], -+ [SWR_ARCH="$withval"], -+ [SWR_ARCH="CORE-AVX2"]) -+ -+case "$SWR_ARCH" in -+"AVX") -+ SWR_ARCH_FLAG='-march=core-avx-i -DKNOB_ARCH=KNOB_ARCH_AVX ' -+ ;; -+"CORE-AVX2") -+ SWR_ARCH_FLAG='-march=core-avx2 -DKNOB_ARCH=KNOB_ARCH_AVX2 ' -+ ;; -+**) -+ SWR_ARCH_FLAG='-march=core-avx2 -DKNOB_ARCH=KNOB_ARCH_AVX2 ' -+esac -+ -+AC_ARG_ENABLE([swr-native], -+ [AS_HELP_STRING([--enable-swr-native], -+ [use in-tree version of SWR core @<:@default=disabled@:>@])], -+ [enable_swr_native="$enableval"], -+ [enable_swr_native=no] -+) -+ - AM_CONDITIONAL(HAVE_GALLIUM_SVGA, test "x$HAVE_GALLIUM_SVGA" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_I915, test "x$HAVE_GALLIUM_I915" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_ILO, test "x$HAVE_GALLIUM_ILO" = xyes) -@@ -2255,6 
+2298,8 @@ AM_CONDITIONAL(HAVE_GALLIUM_NOUVEAU, test "x$HAVE_GALLIUM_NOUVEAU" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes) -+AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes) -+AM_CONDITIONAL(SWR_NATIVE, test "x$enable_swr_native" = xyes) - AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes) - - AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno) -@@ -2374,6 +2419,7 @@ AC_CONFIG_FILES([Makefile - src/gallium/drivers/rbug/Makefile - src/gallium/drivers/softpipe/Makefile - src/gallium/drivers/svga/Makefile -+ src/gallium/drivers/swr/Makefile - src/gallium/drivers/trace/Makefile - src/gallium/drivers/vc4/Makefile - src/gallium/state_trackers/clover/Makefile -@@ -2562,6 +2608,14 @@ if test "x$MESA_LLVM" = x1; then - echo " LLVM_LDFLAGS: $LLVM_LDFLAGS" - echo "" - fi -+if test "x$HAVE_GALLIUM_SWR" = xyes; then -+ echo " SWR_INCLUDEDIR: $SWR_INCLUDEDIR" -+ echo " SWR_LIBDIR: $SWR_LIBDIR" -+ echo " SWR_ARCH: $SWR_ARCH" -+ echo " SWR_ARCH_FLAG: $SWR_ARCH_FLAG" -+ echo " SWR_NATIVE: $enable_swr_native" -+ echo "" -+fi - echo " PYTHON2: $PYTHON2" - - echo "" -diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am -index a7c3606..dcce6a3 100644 ---- a/src/gallium/Makefile.am -+++ b/src/gallium/Makefile.am -@@ -77,6 +77,10 @@ SUBDIRS += drivers/llvmpipe - endif - endif - -+if HAVE_GALLIUM_SWR -+SUBDIRS += drivers/swr -+endif -+ - ## vc4/rpi - if HAVE_GALLIUM_VC4 - SUBDIRS += drivers/vc4 winsys/vc4/drm -diff --git a/src/gallium/SConscript b/src/gallium/SConscript -index fa5fa6e..766c24a 100644 ---- a/src/gallium/SConscript -+++ b/src/gallium/SConscript -@@ -17,6 +17,7 @@ SConscript([ - 'drivers/softpipe/SConscript', - 'drivers/svga/SConscript', - 'drivers/trace/SConscript', -+ 'drivers/swr/SConscript', - ]) - - # -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h -index 0da849b..083b0ad 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h -@@ -37,6 +37,9 @@ - - #include "gallivm/lp_bld.h" - -+#ifdef __cplusplus -+extern "C" { -+#endif - - struct lp_type; - -@@ -198,4 +201,8 @@ lp_build_array_alloca(struct gallivm_state *gallivm, - LLVMValueRef count, - const char *name); - -+#ifdef __cplusplus -+} -+#endif -+ - #endif /* !LP_BLD_FLOW_H */ -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h -index 9e50f88..ab44661 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_init.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h -@@ -35,6 +35,9 @@ - #include "lp_bld.h" - #include - -+#ifdef __cplusplus -+extern "C" { -+#endif - - struct gallivm_state - { -@@ -82,4 +85,8 @@ void - lp_set_store_alignment(LLVMValueRef Inst, - unsigned Align); - -+#ifdef __cplusplus -+} -+#endif -+ - #endif /* !LP_BLD_INIT_H */ -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h -index eba758d..5f53c47 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h -@@ -42,6 +42,9 @@ - #include "gallivm/lp_bld_type.h" - #include "gallivm/lp_bld_swizzle.h" - -+#ifdef __cplusplus -+extern "C" { -+#endif - - struct pipe_resource; - struct pipe_sampler_view; -@@ -612,5 +615,8 @@ lp_build_minify(struct 
lp_build_context *bld, - LLVMValueRef level, - boolean lod_scalar); - -+#ifdef __cplusplus -+} -+#endif - - #endif /* LP_BLD_SAMPLE_H */ -diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h -index 2ca9c61..189d03d 100644 ---- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h -+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h -@@ -48,6 +48,10 @@ - #include "tgsi/tgsi_scan.h" - #include "tgsi/tgsi_info.h" - -+#ifdef __cplusplus -+extern "C" { -+#endif -+ - #define LP_CHAN_ALL ~0 - - #define LP_MAX_INSTRUCTIONS 256 -@@ -661,4 +665,8 @@ lp_build_tgsi_llvm( - struct lp_build_tgsi_context * bld_base, - const struct tgsi_token *tokens); - -+#ifdef __cplusplus -+} -+#endif -+ - #endif /* LP_BLD_TGSI_H */ -diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h -index 5f46552..e67dd17 100644 ---- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h -+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h -@@ -19,6 +19,10 @@ - #include "llvmpipe/lp_public.h" - #endif - -+#ifdef GALLIUM_SWR -+#include "swr/swr_public.h" -+#endif -+ - - static inline struct pipe_screen * - sw_screen_create_named(struct sw_winsys *winsys, const char *driver) -@@ -30,6 +34,11 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) - screen = llvmpipe_create_screen(winsys); - #endif - -+#if defined(GALLIUM_SWR) -+ if (screen == NULL && strcmp(driver, "swr") == 0) -+ screen = swr_create_screen(winsys); -+#endif -+ - #if defined(GALLIUM_SOFTPIPE) - if (screen == NULL) - screen = softpipe_create_screen(winsys); -@@ -45,7 +54,9 @@ sw_screen_create(struct sw_winsys *winsys) - const char *default_driver; - const char *driver; - --#if defined(GALLIUM_LLVMPIPE) -+#if defined(GALLIUM_SWR) -+ default_driver = "swr"; -+#elif defined(GALLIUM_LLVMPIPE) - default_driver = "llvmpipe"; - #elif defined(GALLIUM_SOFTPIPE) - default_driver = "softpipe"; -diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h -index 4f38ba9..d707b8b 100644 ---- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h -+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h -@@ -12,7 +12,7 @@ - static inline struct pipe_screen * - sw_screen_wrap(struct pipe_screen *screen) - { --#if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE) -+#if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE) || defined(GALLIUM_SWR) - struct sw_winsys *sws; - struct pipe_screen *sw_screen = NULL; - const char *driver; -diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format -new file mode 100644 -index 0000000..0ec65a5 ---- /dev/null -+++ b/src/gallium/drivers/swr/.clang-format -@@ -0,0 +1,64 @@ -+--- -+Language: Cpp -+AccessModifierOffset: -3 -+AlignAfterOpenBracket: true -+AlignEscapedNewlinesLeft: false -+AlignOperands: false -+AlignTrailingComments: false -+AllowAllParametersOfDeclarationOnNextLine: true -+AllowShortBlocksOnASingleLine: false -+AllowShortCaseLabelsOnASingleLine: false -+AllowShortIfStatementsOnASingleLine: false -+AllowShortLoopsOnASingleLine: false -+AllowShortFunctionsOnASingleLine: All -+AlwaysBreakAfterDefinitionReturnType: true -+AlwaysBreakTemplateDeclarations: false -+AlwaysBreakBeforeMultilineStrings: false -+BreakBeforeBinaryOperators: NonAssignment -+BreakBeforeTernaryOperators: true -+BreakConstructorInitializersBeforeComma: true -+BinPackParameters: 
false -+BinPackArguments: false -+ColumnLimit: 78 -+ConstructorInitializerAllOnOneLineOrOnePerLine: false -+ConstructorInitializerIndentWidth: 3 -+DerivePointerAlignment: false -+ExperimentalAutoDetectBinPacking: false -+IndentCaseLabels: false -+IndentWrappedFunctionNames: false -+IndentFunctionDeclarationAfterType: false -+MaxEmptyLinesToKeep: 2 -+KeepEmptyLinesAtTheStartOfBlocks: true -+NamespaceIndentation: Inner -+ObjCBlockIndentWidth: 3 -+ObjCSpaceAfterProperty: true -+ObjCSpaceBeforeProtocolList: true -+PenaltyBreakBeforeFirstCallParameter: 19 -+PenaltyBreakComment: 300 -+PenaltyBreakString: 1000 -+PenaltyBreakFirstLessLess: 120 -+PenaltyExcessCharacter: 1000000 -+PenaltyReturnTypeOnItsOwnLine: 0 -+PointerAlignment: Right -+SpacesBeforeTrailingComments: 1 -+Cpp11BracedListStyle: true -+Standard: Cpp11 -+IndentWidth: 3 -+TabWidth: 8 -+UseTab: Never -+BreakBeforeBraces: Linux -+SpacesInParentheses: false -+SpacesInSquareBrackets: false -+SpacesInAngles: false -+SpaceInEmptyParentheses: false -+SpacesInCStyleCastParentheses: false -+SpaceAfterCStyleCast: false -+SpacesInContainerLiterals: true -+SpaceBeforeAssignmentOperators: true -+ContinuationIndentWidth: 3 -+CommentPragmas: '^ IWYU pragma:' -+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -+SpaceBeforeParens: ControlStatements -+DisableFormat: false -+... -+ -diff --git a/src/gallium/drivers/swr/Automake.inc b/src/gallium/drivers/swr/Automake.inc -new file mode 100644 -index 0000000..8e66744 ---- /dev/null -+++ b/src/gallium/drivers/swr/Automake.inc -@@ -0,0 +1,28 @@ -+# Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+if HAVE_GALLIUM_SWR -+ -+TARGET_CPPFLAGS += -DGALLIUM_SWR -+TARGET_LIB_DEPS += \ -+ $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la -+ -+endif -diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am -new file mode 100644 -index 0000000..5dff02c ---- /dev/null -+++ b/src/gallium/drivers/swr/Makefile.am -@@ -0,0 +1,82 @@ -+# Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+AUTOMAKE_OPTIONS = subdir-objects -+ -+include Makefile.sources -+include $(top_srcdir)/src/gallium/Automake.inc -+ -+AM_CXXFLAGS = \ -+ $(GALLIUM_DRIVER_CFLAGS) \ -+ -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS \ -+ $(SWR_ARCH_FLAG) \ -+ $(LLVM_CFLAGS) -+ -+noinst_LTLIBRARIES = libmesaswr.la -+ -+libmesaswr_la_SOURCES = $(CXX_SOURCES) -+ -+libmesaswr_la_LDFLAGS = -+ -+if SWR_NATIVE -+BUILT_SOURCES = \ -+ rasterizer/scripts/gen_knobs.cpp \ -+ rasterizer/scripts/gen_knobs.h \ -+ rasterizer/jitter/state_llvm.h -+ -+rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template -+ $(PYTHON2) $(PYTHON_FLAGS) \ -+ $(srcdir)/rasterizer/scripts/gen_knobs.py \ -+ rasterizer/scripts -+ -+rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h -+ $(PYTHON2) $(PYTHON_FLAGS) \ -+ $(srcdir)/rasterizer/jitter/scripts/gen_llvm_types.py \ -+ --input $(srcdir)/rasterizer/core/state.h \ -+ --output rasterizer/jitter/state_llvm.h -+ -+libmesaswr_la_SOURCES += \ -+ $(COMMON_CXX_SOURCES) \ -+ $(CORE_CXX_SOURCES) \ -+ $(JITTER_CXX_SOURCES) \ -+ $(MEMORY_CXX_SOURCES) \ -+ rasterizer/scripts/gen_knobs.cpp \ -+ rasterizer/scripts/gen_knobs.h -+AM_CXXFLAGS += \ -+ -I$(srcdir)/rasterizer \ -+ -I$(srcdir)/rasterizer/core \ -+ -I$(srcdir)/rasterizer/jitter \ -+ -I$(builddir)/rasterizer/scripts \ -+ -I$(builddir)/rasterizer/jitter -+else -+libmesaswr_la_LDFLAGS += -L$(SWR_LIBDIR) -lSWR -+AM_CXXFLAGS += \ -+ -I$(SWR_INCLUDEDIR) \ -+ -I$(SWR_INCLUDEDIR)/core \ -+ -I$(SWR_INCLUDEDIR)/jitter \ -+ -I$(SWR_INCLUDEDIR)/build/jitter \ -+ -I$(SWR_INCLUDEDIR)/build/scripts -+endif -+ -+libmesaswr_la_LDFLAGS += -lnuma -+ -+ -+EXTRA_DIST = SConscript -diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources -new file mode 100644 -index 0000000..1c6fe08 ---- /dev/null -+++ b/src/gallium/drivers/swr/Makefile.sources -@@ -0,0 +1,114 @@ -+# Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+CXX_SOURCES := \ -+ swr_clear.cpp \ -+ swr_context.cpp \ -+ swr_context.h \ -+ swr_context_llvm.h \ -+ swr_draw.cpp \ -+ swr_public.h \ -+ swr_resource.h \ -+ swr_screen.cpp \ -+ swr_screen.h \ -+ swr_state.cpp \ -+ swr_state.h \ -+ swr_tex_sample.cpp \ -+ swr_tex_sample.h \ -+ swr_scratch.h \ -+ swr_scratch.cpp \ -+ swr_shader.cpp \ -+ swr_memory.h \ -+ swr_fence.h \ -+ swr_fence.cpp \ -+ swr_query.h \ -+ swr_query.cpp -+ -+COMMON_CXX_SOURCES := \ -+ rasterizer/common/containers.hpp \ -+ rasterizer/common/formats.cpp \ -+ rasterizer/common/formats.h \ -+ rasterizer/common/isa.hpp \ -+ rasterizer/common/os.h \ -+ rasterizer/common/rdtsc_buckets.cpp \ -+ rasterizer/common/rdtsc_buckets.h \ -+ rasterizer/common/rdtsc_buckets_shared.h \ -+ rasterizer/common/rdtsc_buckets_shared.h \ -+ rasterizer/common/simdintrin.h \ -+ rasterizer/common/swr_assert.cpp \ -+ rasterizer/common/swr_assert.h -+ -+CORE_CXX_SOURCES := \ -+ rasterizer/core/api.cpp \ -+ rasterizer/core/api.h \ -+ rasterizer/core/arena.cpp \ -+ rasterizer/core/arena.h \ -+ rasterizer/core/backend.cpp \ -+ rasterizer/core/backend.h \ -+ rasterizer/core/blend.h \ -+ rasterizer/core/clip.cpp \ -+ rasterizer/core/clip.h \ -+ rasterizer/core/context.h \ -+ rasterizer/core/depthstencil.h \ -+ rasterizer/core/fifo.hpp \ -+ rasterizer/core/format_traits.h \ -+ rasterizer/core/format_types.h \ -+ rasterizer/core/frontend.cpp \ -+ rasterizer/core/frontend.h \ -+ rasterizer/core/knobs.h \ -+ rasterizer/core/knobs_init.h \ -+ rasterizer/core/multisample.h \ -+ rasterizer/core/pa_avx.cpp \ -+ rasterizer/core/pa.h \ -+ rasterizer/core/rasterizer.cpp \ -+ rasterizer/core/rasterizer.h \ -+ rasterizer/core/rdtsc_core.cpp \ -+ rasterizer/core/rdtsc_core.h \ -+ rasterizer/core/state.h \ -+ rasterizer/core/threads.cpp \ -+ rasterizer/core/threads.h \ -+ rasterizer/core/tilemgr.cpp \ -+ rasterizer/core/tilemgr.h \ -+ rasterizer/core/utils.cpp \ -+ rasterizer/core/utils.h -+ -+JITTER_CXX_SOURCES := \ -+ rasterizer/jitter/blend_jit.cpp \ -+ rasterizer/jitter/blend_jit.h \ -+ rasterizer/jitter/builder.cpp \ -+ rasterizer/jitter/builder_gen.cpp \ -+ rasterizer/jitter/builder_gen.h \ -+ rasterizer/jitter/builder.h \ -+ rasterizer/jitter/builder_misc.cpp \ -+ rasterizer/jitter/builder_misc.h \ -+ rasterizer/jitter/builder_x86.cpp \ -+ rasterizer/jitter/builder_x86.h \ -+ rasterizer/jitter/fetch_jit.cpp \ -+ rasterizer/jitter/fetch_jit.h \ -+ 
rasterizer/jitter/JitManager.cpp \ -+ rasterizer/jitter/JitManager.h \ -+ rasterizer/jitter/streamout_jit.cpp \ -+ rasterizer/jitter/streamout_jit.h -+ -+MEMORY_CXX_SOURCES := \ -+ rasterizer/memory/ClearTile.cpp \ -+ rasterizer/memory/LoadTile.cpp \ -+ rasterizer/memory/StoreTile.cpp -diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript -new file mode 100644 -index 0000000..4c8c121 ---- /dev/null -+++ b/src/gallium/drivers/swr/SConscript -@@ -0,0 +1,69 @@ -+from sys import executable as python_cmd -+import distutils.version -+ -+Import('*') -+ -+if not env['llvm']: -+ print 'warning: LLVM disabled: not building swr' -+ Return() -+ -+env = env.Clone() -+ -+env.MSVC2008Compat() -+ -+env.Append(CPPDEFINES = [ -+ '__STDC_CONSTANT_MACROS', -+ '__STDC_LIMIT_MACROS', -+ 'KNOB_ARCH=KNOB_ARCH_AVX2', -+ ]) -+ -+env.Append(CCFLAGS = [ -+ '-std=c++11', -+ '-march=core-avx2', -+ ]) -+ -+env.Prepend(CPPPATH = [ -+ 'rasterizer', -+ 'rasterizer/core', -+ 'rasterizer/jitter', -+ 'rasterizer/scripts', -+ ]) -+ -+gen_knobs = env.CodeGenerate( -+ target = 'rasterizer/scripts/gen_knobs.cpp', -+ script = 'rasterizer/scripts/gen_knobs.py', -+ source = [], -+ command = python_cmd + ' $SCRIPT ' + Dir('rasterizer/scripts').abspath -+) -+ -+gen_knobs = env.CodeGenerate( -+ target = 'rasterizer/scripts/gen_knobs.h', -+ script = 'rasterizer/scripts/gen_knobs.py', -+ source = [], -+ command = python_cmd + ' $SCRIPT ' + Dir('rasterizer/scripts').abspath -+) -+ -+state_llvm = env.CodeGenerate( -+ target = 'rasterizer/jitter/state_llvm.h', -+ script = 'rasterizer/jitter/scripts/gen_llvm_types.py', -+ source = 'rasterizer/core/state.h', -+ command = python_cmd + ' $SCRIPT --input $SOURCE --output $TARGET' -+) -+ -+source = ['rasterizer/scripts/gen_knobs.cpp', 'rasterizer/scripts/gen_knobs.h'] -+source += env.ParseSourceList('Makefile.sources', [ -+ 'CXX_SOURCES', -+ 'COMMON_CXX_SOURCES', -+ 'CORE_CXX_SOURCES', -+ 'JITTER_CXX_SOURCES', -+ 'MEMORY_CXX_SOURCES' -+]) -+ -+swr = env.ConvenienceLibrary( -+ target = 'swr', -+ source = source, -+ ) -+ -+env.Alias('swr', swr) -+ -+Export('swr') -diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp -new file mode 100644 -index 0000000..7704359 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_clear.cpp -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "swr_context.h" -+#include "swr_query.h" -+ -+static void -+swr_clear(struct pipe_context *pipe, -+ unsigned buffers, -+ const union pipe_color_union *color, -+ double depth, -+ unsigned stencil) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct pipe_framebuffer_state *fb = &ctx->framebuffer; -+ -+ UINT clearMask = 0; -+ -+ if (!swr_check_render_cond(pipe)) -+ return; -+ -+ if (ctx->dirty) -+ swr_update_derived(ctx); -+ -+/* Update clearMask/targetMask */ -+#if 0 /* XXX SWR currently only clears SWR_ATTACHMENT_COLOR0, don't bother \ -+ checking others yet. */ -+ if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { -+ UINT i; -+ for (i = 0; i < fb->nr_cbufs; ++i) -+ if (fb->cbufs[i]) -+ clearMask |= (SWR_CLEAR_COLOR0 << i); -+ } -+#else -+ if (buffers & PIPE_CLEAR_COLOR && fb->cbufs[0]) -+ clearMask |= SWR_CLEAR_COLOR; -+#endif -+ -+ if (buffers & PIPE_CLEAR_DEPTH && fb->zsbuf) -+ clearMask |= SWR_CLEAR_DEPTH; -+ -+ if (buffers & PIPE_CLEAR_STENCIL && fb->zsbuf) -+ clearMask |= SWR_CLEAR_STENCIL; -+ -+#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are -+ // transparent. -+ ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */ -+#endif -+ -+ /* Reset viewport to full framebuffer width/height before clear, then -+ * restore it */ -+ /* Scissor affects clear, viewport should not */ -+ ctx->dirty |= SWR_NEW_VIEWPORT; -+ SWR_VIEWPORT vp = {0}; -+ vp.width = ctx->framebuffer.width; -+ vp.height = ctx->framebuffer.height; -+ SwrSetViewports(ctx->swrContext, 1, &vp, NULL); -+ -+ SwrClearRenderTarget(ctx->swrContext, clearMask, color->f, depth, stencil); -+} -+ -+ -+#if 0 // XXX, these don't get called. how to get these called? Do we need -+ // them? Docs? 
-+static void -+swr_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps, -+ const union pipe_color_union *color, -+ unsigned x, unsigned y, unsigned w, unsigned h) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ fprintf(stderr, "SWR swr_clear_render_target!\n"); -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; -+} -+ -+static void -+swr_clear_depth_stencil(struct pipe_context *pipe, struct pipe_surface *ps, -+ unsigned buffers, double depth, unsigned stencil, -+ unsigned x, unsigned y, unsigned w, unsigned h) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ fprintf(stderr, "SWR swr_clear_depth_stencil!\n"); -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; -+} -+ -+static void -+swr_clear_buffer(struct pipe_context *pipe, -+ struct pipe_resource *res, -+ unsigned offset, unsigned size, -+ const void *data, int data_size) -+{ -+ fprintf(stderr, "SWR swr_clear_buffer!\n"); -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_resource *buf = swr_resource(res); -+ union pipe_color_union color; -+ enum pipe_format dst_fmt; -+ unsigned width, height, elements; -+ -+ assert(res->target == PIPE_BUFFER); -+ assert(buf); -+ assert(size % data_size == 0); -+ -+ SWR_SURFACE_STATE &swr_buffer = buf->swr; -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; -+} -+#endif -+ -+ -+void -+swr_clear_init(struct pipe_context *pipe) -+{ -+ pipe->clear = swr_clear; -+#if 0 // XXX, these don't get called. how to get these called? Do we need -+ // them? Docs? -+ pipe->clear_render_target = swr_clear_render_target; -+ pipe->clear_depth_stencil = swr_clear_depth_stencil; -+ pipe->clear_buffer = swr_clear_buffer; -+#endif -+} -diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp -new file mode 100644 -index 0000000..6269cd0 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_context.cpp -@@ -0,0 +1,392 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "util/u_memory.h" -+#include "util/u_inlines.h" -+#include "util/u_format.h" -+ -+extern "C" { -+#include "util/u_transfer.h" -+#include "util/u_surface.h" -+} -+ -+#include "swr_context.h" -+#include "swr_memory.h" -+#include "swr_screen.h" -+#include "swr_resource.h" -+#include "swr_scratch.h" -+#include "swr_query.h" -+ -+#include "api.h" -+ -+static struct pipe_surface * -+swr_create_surface(struct pipe_context *pipe, -+ struct pipe_resource *pt, -+ const struct pipe_surface *surf_tmpl) -+{ -+ struct pipe_surface *ps; -+ -+ ps = CALLOC_STRUCT(pipe_surface); -+ if (ps) { -+ pipe_reference_init(&ps->reference, 1); -+ pipe_resource_reference(&ps->texture, pt); -+ ps->context = pipe; -+ ps->format = surf_tmpl->format; -+ if (pt->target != PIPE_BUFFER) { -+ assert(surf_tmpl->u.tex.level <= pt->last_level); -+ ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); -+ ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); -+ ps->u.tex.level = surf_tmpl->u.tex.level; -+ ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; -+ ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; -+ if (ps->u.tex.first_layer != ps->u.tex.last_layer) { -+ debug_printf("creating surface with multiple layers, rendering " -+ "to first layer only\n"); -+ } -+ } else { -+ /* setting width as number of elements should get us correct -+ * renderbuffer width */ -+ ps->width = surf_tmpl->u.buf.last_element -+ - surf_tmpl->u.buf.first_element + 1; -+ ps->height = pt->height0; -+ ps->u.buf.first_element = surf_tmpl->u.buf.first_element; -+ ps->u.buf.last_element = surf_tmpl->u.buf.last_element; -+ assert(ps->u.buf.first_element <= ps->u.buf.last_element); -+ assert(ps->u.buf.last_element < ps->width); -+ } -+ } -+ return ps; -+} -+ -+static void -+swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf) -+{ -+ assert(surf->texture); -+ struct pipe_resource *resource = surf->texture; -+ -+ /* If the surface being destroyed is a current render target, -+ * call StoreTiles to resolve the hotTile state then set attachment -+ * to NULL. -+ */ -+ if (resource->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL -+ | PIPE_BIND_DISPLAY_TARGET)) { -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_resource *spr = swr_resource(resource); -+ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) -+ if (ctx->current.attachment[i] == &spr->swr) { -+ swr_store_render_target(ctx, i, SWR_TILE_RESOLVED); -+ ctx->current.attachment[i] = nullptr; -+ /* -+ * Mesa thinks depth/stencil are fused, so we'll never get an -+ * explicit resource for stencil. So, if checking depth, then -+ * also -+ * check for stencil. 
-+ */ -+ if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) { -+ swr_store_render_target( -+ ctx, SWR_ATTACHMENT_STENCIL, SWR_TILE_RESOLVED); -+ ctx->current.attachment[SWR_ATTACHMENT_STENCIL] = nullptr; -+ } -+ -+ SwrWaitForIdle(ctx->swrContext); -+ break; -+ } -+ } -+ -+ pipe_resource_reference(&surf->texture, NULL); -+ FREE(surf); -+} -+ -+ -+static void * -+swr_transfer_map(struct pipe_context *pipe, -+ struct pipe_resource *resource, -+ unsigned level, -+ unsigned usage, -+ const struct pipe_box *box, -+ struct pipe_transfer **transfer) -+{ -+ struct swr_resource *spr = swr_resource(resource); -+ struct pipe_transfer *pt; -+ enum pipe_format format = resource->format; -+ -+ assert(resource); -+ assert(level <= resource->last_level); -+ -+ /* -+ * If mapping any attached rendertarget, store tiles and wait for idle -+ * before giving CPU access to the surface. -+ * (set postStoreTileState to SWR_TILE_INVALID so tiles are reloaded) -+ */ -+ if (resource->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL -+ | PIPE_BIND_DISPLAY_TARGET)) { -+ struct swr_context *ctx = swr_context(pipe); -+ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) -+ if (ctx->current.attachment[i] == &spr->swr) { -+ swr_store_render_target(ctx, i, SWR_TILE_INVALID); -+ /* -+ * Mesa thinks depth/stencil are fused, so we'll never get an -+ * explicit map for stencil. So, if mapping depth, then also -+ * store tile for stencil. -+ */ -+ if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) -+ swr_store_render_target( -+ ctx, SWR_ATTACHMENT_STENCIL, SWR_TILE_INVALID); -+ SwrWaitForIdle(ctx->swrContext); -+ break; -+ } -+ } -+ -+ -+ pt = CALLOC_STRUCT(pipe_transfer); -+ if (!pt) -+ return NULL; -+ pipe_resource_reference(&pt->resource, resource); -+ pt->level = level; -+ pt->box = *box; -+ pt->stride = spr->row_stride[level]; -+ pt->layer_stride = spr->img_stride[level]; -+ -+ /* if we're mapping the depth/stencil, copy in stencil */ -+ if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT -+ && spr->has_stencil) { -+ for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) { -+ spr->swr.pBaseAddress[4 * i + 3] = spr->secondary.pBaseAddress[i]; -+ } -+ } -+ -+ unsigned offset = box->z * pt->layer_stride + box->y * pt->stride -+ + box->x * util_format_get_blocksize(format); -+ -+ *transfer = pt; -+ -+ return spr->swr.pBaseAddress + offset + spr->mip_offsets[level]; -+} -+ -+static void -+swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer) -+{ -+ assert(transfer->resource); -+ -+ /* -+ * XXX TODO: use fences and come up with a real resource manager. -+ * -+ * If this resource has been mapped/unmapped, it's probably in use. Tag it -+ *with this context so -+ * we'll know to check dependencies when it's deleted. 
-+ */ -+ struct swr_resource *res = swr_resource(transfer->resource); -+ res->bound_to_context = (void *)pipe; -+ -+ /* if we're mapping the depth/stencil, copy out stencil */ -+ if (res->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT -+ && res->has_stencil) { -+ for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) { -+ res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[4 * i + 3]; -+ } -+ } -+ -+ pipe_resource_reference(&transfer->resource, NULL); -+ FREE(transfer); -+} -+ -+ -+static void -+swr_resource_copy(struct pipe_context *pipe, -+ struct pipe_resource *dst, -+ unsigned dst_level, -+ unsigned dstx, -+ unsigned dsty, -+ unsigned dstz, -+ struct pipe_resource *src, -+ unsigned src_level, -+ const struct pipe_box *src_box) -+{ -+ if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) -+ || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) { -+ util_resource_copy_region( -+ pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); -+ return; -+ } -+ -+ debug_printf("unhandled swr_resource_copy\n"); -+} -+ -+ -+static void -+swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct pipe_blit_info info = *blit_info; -+ -+ if (blit_info->render_condition_enable && !swr_check_render_cond(pipe)) -+ return; -+ -+ if (info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1 -+ && !util_format_is_depth_or_stencil(info.src.resource->format) -+ && !util_format_is_pure_integer(info.src.resource->format)) { -+ debug_printf("swr: color resolve unimplemented\n"); -+ return; -+ } -+ -+ if (util_try_blit_via_copy_region(pipe, &info)) { -+ return; /* done */ -+ } -+ -+ if (info.mask & PIPE_MASK_S) { -+ debug_printf("swr: cannot blit stencil, skipping\n"); -+ info.mask &= ~PIPE_MASK_S; -+ } -+ -+ if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { -+ debug_printf("swr: blit unsupported %s -> %s\n", -+ util_format_short_name(info.src.resource->format), -+ util_format_short_name(info.dst.resource->format)); -+ return; -+ } -+ -+ /* XXX turn off occlusion and streamout queries */ -+ -+ util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer); -+ util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems); -+ util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs); -+ /*util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs);*/ -+ util_blitter_save_so_targets( -+ ctx->blitter, -+ ctx->num_so_targets, -+ (struct pipe_stream_output_target **)ctx->so_targets); -+ util_blitter_save_rasterizer(ctx->blitter, (void *)ctx->rasterizer); -+ util_blitter_save_viewport(ctx->blitter, &ctx->viewport); -+ util_blitter_save_scissor(ctx->blitter, &ctx->scissor); -+ util_blitter_save_fragment_shader(ctx->blitter, ctx->fs); -+ util_blitter_save_blend(ctx->blitter, (void *)ctx->blend); -+ util_blitter_save_depth_stencil_alpha(ctx->blitter, -+ (void *)ctx->depth_stencil); -+ util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); -+ util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask); -+ util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); -+ util_blitter_save_fragment_sampler_states( -+ ctx->blitter, -+ ctx->num_samplers[PIPE_SHADER_FRAGMENT], -+ (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]); -+ util_blitter_save_fragment_sampler_views( -+ ctx->blitter, -+ ctx->num_sampler_views[PIPE_SHADER_FRAGMENT], -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT]); -+ util_blitter_save_render_condition(ctx->blitter, -+ 
ctx->render_cond_query, -+ ctx->render_cond_cond, -+ ctx->render_cond_mode); -+ -+ util_blitter_blit(ctx->blitter, &info); -+} -+ -+ -+static void -+swr_destroy(struct pipe_context *pipe) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->blitter) -+ util_blitter_destroy(ctx->blitter); -+ -+ if (ctx->swrContext) -+ SwrDestroyContext(ctx->swrContext); -+ -+ delete ctx->blendJIT; -+ -+ swr_destroy_scratch_buffers(ctx); -+ -+ FREE(ctx); -+} -+ -+ -+static void -+swr_render_condition(struct pipe_context *pipe, -+ struct pipe_query *query, -+ boolean condition, -+ uint mode) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->render_cond_query = query; -+ ctx->render_cond_mode = mode; -+ ctx->render_cond_cond = condition; -+} -+ -+ -+struct pipe_context * -+swr_create_context(struct pipe_screen *screen, void *priv) -+{ -+ struct swr_context *ctx = CALLOC_STRUCT(swr_context); -+ ctx->blendJIT = -+ new std::unordered_map; -+ -+ SWR_CREATECONTEXT_INFO createInfo; -+ createInfo.driver = GL; -+ createInfo.privateStateSize = sizeof(swr_draw_context); -+ createInfo.pfnLoadTile = swr_LoadHotTile; -+ createInfo.pfnStoreTile = swr_StoreHotTile; -+ createInfo.pfnClearTile = swr_StoreHotTileClear; -+ ctx->swrContext = SwrCreateContext(&createInfo); -+ -+ /* Init Load/Store/ClearTiles Tables */ -+ swr_InitMemoryModule(); -+ -+ if (ctx->swrContext == NULL) -+ goto fail; -+ -+ ctx->pipe.screen = screen; -+ ctx->pipe.destroy = swr_destroy; -+ ctx->pipe.priv = priv; -+ ctx->pipe.create_surface = swr_create_surface; -+ ctx->pipe.surface_destroy = swr_surface_destroy; -+ ctx->pipe.transfer_map = swr_transfer_map; -+ ctx->pipe.transfer_unmap = swr_transfer_unmap; -+ -+ ctx->pipe.transfer_flush_region = u_default_transfer_flush_region; -+ ctx->pipe.transfer_inline_write = u_default_transfer_inline_write; -+ -+ ctx->pipe.resource_copy_region = swr_resource_copy; -+ ctx->pipe.render_condition = swr_render_condition; -+ -+ swr_state_init(&ctx->pipe); -+ swr_clear_init(&ctx->pipe); -+ swr_draw_init(&ctx->pipe); -+ swr_query_init(&ctx->pipe); -+ -+ ctx->pipe.blit = swr_blit; -+ ctx->blitter = util_blitter_create(&ctx->pipe); -+ if (!ctx->blitter) { -+ goto fail; -+ } -+ -+ swr_init_scratch_buffers(ctx); -+ -+ return &ctx->pipe; -+ -+fail: -+ /* Should really validate the init steps and fail gracefully */ -+ swr_destroy(&ctx->pipe); -+ return NULL; -+} -diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h -new file mode 100644 -index 0000000..9d93a6d ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_context.h -@@ -0,0 +1,172 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. 
-+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_CONTEXT_H -+#define SWR_CONTEXT_H -+ -+#include "pipe/p_context.h" -+#include "pipe/p_state.h" -+#include "util/u_blitter.h" -+#include "jit_api.h" -+#include "swr_state.h" -+#include -+ -+#define SWR_NEW_BLEND (1 << 0) -+#define SWR_NEW_RASTERIZER (1 << 1) -+#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2) -+#define SWR_NEW_SAMPLER (1 << 3) -+#define SWR_NEW_SAMPLER_VIEW (1 << 4) -+#define SWR_NEW_VS (1 << 5) -+#define SWR_NEW_FS (1 << 6) -+#define SWR_NEW_VSCONSTANTS (1 << 7) -+#define SWR_NEW_FSCONSTANTS (1 << 8) -+#define SWR_NEW_VERTEX (1 << 9) -+#define SWR_NEW_STIPPLE (1 << 10) -+#define SWR_NEW_SCISSOR (1 << 11) -+#define SWR_NEW_VIEWPORT (1 << 12) -+#define SWR_NEW_FRAMEBUFFER (1 << 13) -+#define SWR_NEW_CLIP (1 << 14) -+#define SWR_NEW_SO (1 << 15) -+#define SWR_NEW_ALL 0x0000ffff -+ -+namespace std -+{ -+template <> struct hash { -+ std::size_t operator()(const BLEND_COMPILE_STATE &k) const -+ { -+ return util_hash_crc32(&k, sizeof(k)); -+ } -+}; -+}; -+ -+struct swr_context { -+ struct pipe_context pipe; /**< base class */ -+ -+ HANDLE swrContext; -+ -+ /** Constant state objects */ -+ struct swr_blend_state *blend; -+ struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; -+ struct pipe_depth_stencil_alpha_state *depth_stencil; -+ struct pipe_rasterizer_state *rasterizer; -+ -+ struct swr_vertex_shader *vs; -+ struct swr_fragment_shader *fs; -+ struct swr_vertex_element_state *velems; -+ -+ /** Other rendering state */ -+ struct pipe_blend_color blend_color; -+ struct pipe_stencil_ref stencil_ref; -+ struct pipe_clip_state clip; -+ struct pipe_constant_buffer -+ constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; -+ struct pipe_framebuffer_state framebuffer; -+ struct pipe_poly_stipple poly_stipple; -+ struct pipe_scissor_state scissor; -+ struct pipe_sampler_view * -+ sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ -+ struct pipe_viewport_state viewport; -+ struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; -+ struct pipe_index_buffer index_buffer; -+ -+ struct blitter_context *blitter; -+ -+ /** Conditional query object and mode */ -+ struct pipe_query *render_cond_query; -+ uint render_cond_mode; -+ boolean render_cond_cond; -+ unsigned active_queries; -+ -+ unsigned num_vertex_buffers; -+ unsigned num_samplers[PIPE_SHADER_TYPES]; -+ unsigned num_sampler_views[PIPE_SHADER_TYPES]; -+ -+ unsigned sample_mask; -+ -+ // streamout -+ pipe_stream_output_target *so_targets[MAX_SO_STREAMS]; -+ uint32_t num_so_targets; -+ -+ /* Temp storage for user_buffer constants */ -+ struct swr_scratch_buffers *scratch; -+ -+ // blend jit functions -+ std::unordered_map *blendJIT; -+ -+ /* Shadows of current SWR API DrawState */ -+ struct swr_shadow_state current; -+ -+ unsigned dirty; /**< Mask of SWR_NEW_x flags */ -+}; -+ -+struct swr_jit_texture { -+ uint32_t width; // same as number of elements -+ uint32_t height; -+ uint32_t depth; // doubles as array 
size -+ uint32_t first_level; -+ uint32_t last_level; -+ const void *base_ptr; -+ uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; -+}; -+ -+struct swr_jit_sampler { -+ float min_lod; -+ float max_lod; -+ float lod_bias; -+ float border_color[4]; -+}; -+ -+struct swr_draw_context { -+ const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS]; -+ unsigned num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS]; -+ const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS]; -+ unsigned num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS]; -+ -+ swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS]; -+ swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS]; -+ -+ SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS]; -+}; -+ -+ -+static INLINE struct swr_context * -+swr_context(struct pipe_context *pipe) -+{ -+ return (struct swr_context *)pipe; -+} -+ -+struct pipe_context *swr_create_context(struct pipe_screen *, void *priv); -+ -+void swr_state_init(struct pipe_context *pipe); -+ -+void swr_clear_init(struct pipe_context *pipe); -+ -+void swr_draw_init(struct pipe_context *pipe); -+ -+void swr_finish(struct pipe_context *pipe); -+#endif -diff --git a/src/gallium/drivers/swr/swr_context_llvm.h b/src/gallium/drivers/swr/swr_context_llvm.h -new file mode 100644 -index 0000000..58da813 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_context_llvm.h -@@ -0,0 +1,124 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// Generate LLVM type information for swr_jit_texture -+INLINE static StructType * -+Gen_swr_jit_texture(JitManager *pShG) -+{ -+ LLVMContext &ctx = pShG->mContext; -+ std::vector members; -+ -+ members.push_back(Type::getInt32Ty(ctx)); // width -+ members.push_back(Type::getInt32Ty(ctx)); // height -+ members.push_back(Type::getInt32Ty(ctx)); // depth -+ members.push_back(Type::getInt32Ty(ctx)); // first_level -+ members.push_back(Type::getInt32Ty(ctx)); // last_level -+ members.push_back(PointerType::get(Type::getInt8Ty(ctx), 0)); // base_ptr -+ members.push_back(ArrayType::get(Type::getInt32Ty(ctx), -+ PIPE_MAX_TEXTURE_LEVELS)); // row_stride -+ members.push_back(ArrayType::get(Type::getInt32Ty(ctx), -+ PIPE_MAX_TEXTURE_LEVELS)); // img_stride -+ members.push_back(ArrayType::get(Type::getInt32Ty(ctx), -+ PIPE_MAX_TEXTURE_LEVELS)); // mip_offsets -+ -+ return StructType::get(ctx, members, false); -+} -+ -+static const UINT swr_jit_texture_width = 0; -+static const UINT swr_jit_texture_height = 1; -+static const UINT swr_jit_texture_depth = 2; -+static const UINT swr_jit_texture_first_level = 3; -+static const UINT swr_jit_texture_last_level = 4; -+static const UINT swr_jit_texture_base_ptr = 5; -+static const UINT swr_jit_texture_row_stride = 6; -+static const UINT swr_jit_texture_img_stride = 7; -+static const UINT swr_jit_texture_mip_offsets = 8; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Generate LLVM type information for swr_jit_sampler -+INLINE static StructType * -+Gen_swr_jit_sampler(JitManager *pShG) -+{ -+ LLVMContext &ctx = pShG->mContext; -+ std::vector members; -+ -+ members.push_back(Type::getFloatTy(ctx)); // min_lod -+ members.push_back(Type::getFloatTy(ctx)); // max_lod -+ members.push_back(Type::getFloatTy(ctx)); // lod_bias -+ members.push_back( -+ ArrayType::get(Type::getFloatTy(ctx), 4)); // border_color -+ -+ return StructType::get(ctx, members, false); -+} -+ -+static const UINT swr_jit_sampler_min_lod = 0; -+static const UINT swr_jit_sampler_max_lod = 1; -+static const UINT swr_jit_sampler_lod_bias = 2; -+static const UINT swr_jit_sampler_border_color = 3; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Generate LLVM type information for swr_draw_context -+INLINE static StructType * -+Gen_swr_draw_context(JitManager *pShG) -+{ -+ LLVMContext &ctx = pShG->mContext; -+ std::vector members; -+ -+ members.push_back( -+ ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), -+ PIPE_MAX_CONSTANT_BUFFERS)); // constantVS -+ members.push_back(ArrayType::get( -+ Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsVS -+ members.push_back( -+ ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), -+ PIPE_MAX_CONSTANT_BUFFERS)); // constantFS -+ members.push_back(ArrayType::get( -+ Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsFS -+ members.push_back( -+ ArrayType::get(Gen_swr_jit_texture(pShG), -+ PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesVS -+ members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), -+ PIPE_MAX_SAMPLERS)); // samplersVS -+ members.push_back( -+ ArrayType::get(Gen_swr_jit_texture(pShG), -+ PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesFS -+ members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), -+ PIPE_MAX_SAMPLERS)); // samplersFS -+ 
members.push_back(ArrayType::get(Gen_SWR_SURFACE_STATE(pShG), -+ SWR_NUM_ATTACHMENTS)); // renderTargets -+ -+ return StructType::get(ctx, members, false); -+} -+ -+static const UINT swr_draw_context_constantVS = 0; -+static const UINT swr_draw_context_num_constantsVS = 1; -+static const UINT swr_draw_context_constantFS = 2; -+static const UINT swr_draw_context_num_constantsFS = 3; -+static const UINT swr_draw_context_texturesVS = 4; -+static const UINT swr_draw_context_samplersVS = 5; -+static const UINT swr_draw_context_texturesFS = 6; -+static const UINT swr_draw_context_samplersFS = 7; -+static const UINT swr_draw_context_renderTargets = 8; -diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp -new file mode 100644 -index 0000000..797ebdc ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_draw.cpp -@@ -0,0 +1,277 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "swr_screen.h" -+#include "swr_context.h" -+#include "swr_resource.h" -+#include "swr_fence.h" -+#include "swr_query.h" -+#include "jit_api.h" -+ -+#include "util/u_draw.h" -+#include "util/u_prim.h" -+ -+/* -+ * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY -+ */ -+static INLINE enum PRIMITIVE_TOPOLOGY -+swr_convert_prim_topology(const unsigned mode) -+{ -+ switch (mode) { -+ case PIPE_PRIM_POINTS: -+ return TOP_POINT_LIST; -+ case PIPE_PRIM_LINES: -+ return TOP_LINE_LIST; -+ case PIPE_PRIM_LINE_LOOP: -+ return TOP_LINE_LOOP; -+ case PIPE_PRIM_LINE_STRIP: -+ return TOP_LINE_STRIP; -+ case PIPE_PRIM_TRIANGLES: -+ return TOP_TRIANGLE_LIST; -+ case PIPE_PRIM_TRIANGLE_STRIP: -+ return TOP_TRIANGLE_STRIP; -+ case PIPE_PRIM_TRIANGLE_FAN: -+ return TOP_TRIANGLE_FAN; -+ case PIPE_PRIM_QUADS: -+ return TOP_QUAD_LIST; -+ case PIPE_PRIM_QUAD_STRIP: -+ return TOP_QUAD_STRIP; -+ case PIPE_PRIM_POLYGON: -+ return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */ -+ case PIPE_PRIM_LINES_ADJACENCY: -+ return TOP_LINE_LIST_ADJ; -+ case PIPE_PRIM_LINE_STRIP_ADJACENCY: -+ return TOP_LISTSTRIP_ADJ; -+ case PIPE_PRIM_TRIANGLES_ADJACENCY: -+ return TOP_TRI_LIST_ADJ; -+ case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: -+ return TOP_TRI_STRIP_ADJ; -+ default: -+ assert(0 && "Unknown topology"); -+ return TOP_UNKNOWN; -+ } -+}; -+ -+ -+/* -+ * Draw vertex arrays, with optional indexing, optional instancing. -+ */ -+static void -+swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (!swr_check_render_cond(pipe)) -+ return; -+ -+ if (info->indirect) { -+ util_draw_indirect(pipe, info); -+ return; -+ } -+ -+ /* Update derived state, pass draw info to update function */ -+ if (ctx->dirty) -+ swr_update_derived(ctx, info); -+ -+ if (ctx->vs->pipe.stream_output.num_outputs) { -+ if (!ctx->vs->soFunc[info->mode]) { -+ STREAMOUT_COMPILE_STATE state = {0}; -+ struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output; -+ -+ state.numVertsPerPrim = u_vertices_per_prim(info->mode); -+ -+ uint32_t offsets[MAX_SO_STREAMS] = {0}; -+ uint32_t num = 0; -+ -+ for (uint32_t i = 0; i < so->num_outputs; i++) { -+ assert(so->output[i].stream == 0); // @todo -+ uint32_t output_buffer = so->output[i].output_buffer; -+ if (so->output[i].dst_offset != offsets[output_buffer]) { -+ // hole - need to fill -+ state.stream.decl[num].bufferIndex = output_buffer; -+ state.stream.decl[num].hole = true; -+ state.stream.decl[num].componentMask = -+ (1 << (so->output[i].dst_offset - offsets[output_buffer])) -+ - 1; -+ num++; -+ offsets[output_buffer] = so->output[i].dst_offset; -+ } -+ -+ state.stream.decl[num].bufferIndex = output_buffer; -+ state.stream.decl[num].attribSlot = so->output[i].register_index - 1; -+ state.stream.decl[num].componentMask = -+ ((1 << so->output[i].num_components) - 1) -+ << so->output[i].start_component; -+ state.stream.decl[num].hole = false; -+ num++; -+ -+ offsets[output_buffer] += so->output[i].num_components; -+ } -+ -+ state.stream.numDecls = num; -+ -+ HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr; -+ ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state); -+ debug_printf("so shader %p\n", ctx->vs->soFunc[info->mode]); -+ assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL"); -+ } -+ -+ SwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0); -+ } -+ -+ struct swr_vertex_element_state *velems = ctx->velems; 
-+ if (!velems->fsFunc -+ || (velems->fsState.cutIndex != info->restart_index) -+ || (velems->fsState.bEnableCutIndex != info->primitive_restart)) { -+ -+ velems->fsState.cutIndex = info->restart_index; -+ velems->fsState.bEnableCutIndex = info->primitive_restart; -+ -+ /* Create Fetch Shader */ -+ HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; -+ velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState); -+ -+ debug_printf("fetch shader %p\n", velems->fsFunc); -+ assert(velems->fsFunc && "Error: FetchShader = NULL"); -+ } -+ -+ SwrSetFetchFunc(ctx->swrContext, velems->fsFunc); -+ -+ if (info->indexed) -+ SwrDrawIndexedInstanced(ctx->swrContext, -+ swr_convert_prim_topology(info->mode), -+ info->count, -+ info->instance_count, -+ info->start, -+ info->index_bias, -+ info->start_instance); -+ else -+ SwrDrawInstanced(ctx->swrContext, -+ swr_convert_prim_topology(info->mode), -+ info->count, -+ info->instance_count, -+ info->start, -+ info->start_instance); -+} -+ -+ -+static void -+swr_flush(struct pipe_context *pipe, -+ struct pipe_fence_handle **fence, -+ unsigned flags) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_screen *screen = swr_screen(pipe->screen); -+ -+ /* If the current renderTarget is the display surface, store tiles back to -+ * the surface, in -+ * preparation for present (swr_flush_frontbuffer) -+ */ -+ struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; -+ if (cb && swr_resource(cb->texture)->display_target) -+ swr_store_render_target(ctx, SWR_ATTACHMENT_COLOR0, SWR_TILE_RESOLVED); -+ -+ // SwrStoreTiles is asynchronous, always submit the "flush" fence. -+ // flush_frontbuffer needs it. -+ swr_fence_submit(ctx, screen->flush_fence); -+ -+ if (fence) -+ swr_fence_reference(pipe->screen, fence, screen->flush_fence); -+} -+ -+void -+swr_finish(struct pipe_context *pipe) -+{ -+ struct swr_screen *screen = swr_screen(pipe->screen); -+ struct pipe_fence_handle *fence = NULL; -+ -+ swr_flush(pipe, &fence, 0); -+ swr_fence_finish(&screen->base, fence, 0); -+ swr_fence_reference(&screen->base, &fence, NULL); -+} -+ -+ -+/* -+ * Store SWR HotTiles back to RenderTarget surface. -+ */ -+void -+swr_store_render_target(struct swr_context *ctx, -+ uint32_t attachment, -+ enum SWR_TILE_STATE post_tile_state, -+ struct SWR_SURFACE_STATE *surface) -+{ -+ struct swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment]; -+ -+ /* If the passed in surface isn't already attached, it will be attached and -+ * then restored. 
*/ -+ if (surface && (surface != ctx->current.attachment[attachment])) -+ *renderTarget = *surface; -+ -+ /* Only proceed if there's a valid surface to store to */ -+ if (renderTarget->pBaseAddress) { -+ /* Set viewport to full renderTarget width/height and disable scissor -+ * before StoreTiles */ -+ boolean change_viewport = -+ (ctx->current.vp.x != 0.0f || ctx->current.vp.y != 0.0f -+ || ctx->current.vp.width != renderTarget->width -+ || ctx->current.vp.height != renderTarget->height); -+ if (change_viewport) { -+ SWR_VIEWPORT vp = {0}; -+ vp.width = renderTarget->width; -+ vp.height = renderTarget->height; -+ SwrSetViewports(ctx->swrContext, 1, &vp, NULL); -+ } -+ -+ boolean scissor_enable = ctx->current.rastState.scissorEnable; -+ if (scissor_enable) { -+ ctx->current.rastState.scissorEnable = FALSE; -+ SwrSetRastState(ctx->swrContext, &ctx->current.rastState); -+ } -+ -+ SwrStoreTiles(ctx->swrContext, -+ (enum SWR_RENDERTARGET_ATTACHMENT)attachment, -+ post_tile_state); -+ -+ /* Restore viewport and scissor enable */ -+ if (change_viewport) -+ SwrSetViewports(ctx->swrContext, 1, &ctx->current.vp, &ctx->current.vpm); -+ if (scissor_enable) { -+ ctx->current.rastState.scissorEnable = scissor_enable; -+ SwrSetRastState(ctx->swrContext, &ctx->current.rastState); -+ } -+ -+ /* Restore surface attachment, if changed */ -+ if (surface && (surface != ctx->current.attachment[attachment])) -+ *renderTarget = *ctx->current.attachment[attachment]; -+ } -+} -+ -+ -+void -+swr_draw_init(struct pipe_context *pipe) -+{ -+ pipe->draw_vbo = swr_draw_vbo; -+ pipe->flush = swr_flush; -+} -diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp -new file mode 100644 -index 0000000..aaf7223 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_fence.cpp -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "pipe/p_screen.h" -+#include "util/u_memory.h" -+#include "os/os_time.h" -+ -+#include "swr_context.h" -+#include "swr_screen.h" -+#include "swr_fence.h" -+ -+ -+/* -+ * Fence callback, called by back-end thread on completion of all rendering up -+ * to SwrSync call. 
-+ */ -+static void -+swr_sync_cb(UINT64 userData, UINT64 userData2) -+{ -+ struct swr_fence *fence = (struct swr_fence *)userData; -+ -+ fence->read = fence->write; -+} -+ -+/* -+ * Submit an existing fence. -+ */ -+void -+swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh) -+{ -+ struct swr_fence *fence = swr_fence(fh); -+ -+ fence->write++; -+ SwrSync(ctx->swrContext, swr_sync_cb, (UINT64)fence, 0); -+} -+ -+/* -+ * Create a new fence object. -+ */ -+struct pipe_fence_handle * -+swr_fence_create() -+{ -+ static int fence_id = 0; -+ struct swr_fence *fence = CALLOC_STRUCT(swr_fence); -+ if (!fence) -+ return NULL; -+ -+ memset(fence, 0, sizeof(*fence)); -+ pipe_reference_init(&fence->reference, 1); -+ fence->id = fence_id++; -+ -+ return (struct pipe_fence_handle *)fence; -+} -+ -+/** Destroy a fence. Called when refcount hits zero. */ -+static void -+swr_fence_destroy(struct swr_fence *fence) -+{ -+ FREE(fence); -+} -+ -+/** -+ * Set ptr = fence, with reference counting -+ */ -+void -+swr_fence_reference(struct pipe_screen *screen, -+ struct pipe_fence_handle **ptr, -+ struct pipe_fence_handle *f) -+{ -+ struct swr_fence *fence = swr_fence(f); -+ struct swr_fence *old; -+ -+ if (likely(ptr)) { -+ old = swr_fence(*ptr); -+ *ptr = f; -+ } else { -+ old = NULL; -+ } -+ -+ if (pipe_reference(&old->reference, &fence->reference)) -+ swr_fence_destroy(old); -+} -+ -+/* -+ * Wait for the fence to finish. -+ */ -+boolean -+swr_fence_finish(struct pipe_screen *screen, -+ struct pipe_fence_handle *fence_handle, -+ uint64_t timeout) -+{ -+ struct swr_fence *fence = swr_fence(fence_handle); -+ -+ while (!swr_is_fence_done(fence)) -+ sched_yield(); -+ -+ return TRUE; -+} -+ -+ -+uint64_t -+swr_get_timestamp(struct pipe_screen *screen) -+{ -+ return os_time_get_nano(); -+} -+ -+ -+void -+swr_fence_init(struct pipe_screen *p_screen) -+{ -+ p_screen->fence_reference = swr_fence_reference; -+ p_screen->fence_finish = swr_fence_finish; -+ -+ p_screen->get_timestamp = swr_get_timestamp; -+ -+ /* -+ * Create persistent "flush" fence, submitted when swr_flush is called. -+ */ -+ struct swr_screen *screen = swr_screen(p_screen); -+ screen->flush_fence = swr_fence_create(); -+} -diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h -new file mode 100644 -index 0000000..317d74c ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_fence.h -@@ -0,0 +1,73 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice shall be included -+ * in all copies or substantial portions of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_FENCE_H -+#define SWR_FENCE_H -+ -+ -+#include "os/os_thread.h" -+#include "pipe/p_state.h" -+#include "util/u_inlines.h" -+ -+ -+struct pipe_screen; -+ -+struct swr_fence { -+ struct pipe_reference reference; -+ -+ uint64_t read; -+ uint64_t write; -+ -+ unsigned id; /* Just for reference */ -+}; -+ -+ -+static inline struct swr_fence * -+swr_fence(struct pipe_fence_handle *fence) -+{ -+ return (struct swr_fence *)fence; -+} -+ -+static INLINE boolean -+swr_is_fence_done(struct swr_fence *fence) -+{ -+ return (fence->read == fence->write); -+} -+ -+ -+void swr_fence_init(struct pipe_screen *screen); -+ -+struct pipe_fence_handle *swr_fence_create(); -+ -+void swr_fence_reference(struct pipe_screen *screen, -+ struct pipe_fence_handle **ptr, -+ struct pipe_fence_handle *f); -+ -+boolean swr_fence_finish(struct pipe_screen *screen, -+ struct pipe_fence_handle *fence_handle, -+ uint64_t timeout); -+ -+void -+swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence); -+ -+uint64_t swr_get_timestamp(struct pipe_screen *screen); -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h -new file mode 100644 -index 0000000..d116781 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_memory.h -@@ -0,0 +1,99 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#pragma once -+ -+void LoadHotTile( -+ SWR_SURFACE_STATE *pSrcSurface, -+ SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, uint32_t renderTargetArrayIndex, -+ BYTE *pDstHotTile); -+ -+void StoreHotTile( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, uint32_t renderTargetArrayIndex, -+ BYTE *pSrcHotTile); -+ -+void StoreHotTileClear( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, -+ UINT y, -+ const float* pClearColor); -+ -+INLINE void -+swr_LoadHotTile(HANDLE hPrivateContext, -+ SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, -+ uint32_t renderTargetArrayIndex, BYTE* pDstHotTile) -+{ -+ // Grab source surface state from private context -+ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; -+ SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex]; -+ -+ LoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile); -+} -+ -+INLINE void -+swr_StoreHotTile(HANDLE hPrivateContext, -+ SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, UINT y, -+ uint32_t renderTargetArrayIndex, BYTE* pSrcHotTile) -+{ -+ // Grab destination surface state from private context -+ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; -+ SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; -+ -+ StoreHotTile(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); -+} -+ -+INLINE void -+swr_StoreHotTileClear(HANDLE hPrivateContext, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, -+ UINT y, -+ const float* pClearColor) -+{ -+ // Grab destination surface state from private context -+ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; -+ SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; -+ -+ StoreHotTileClear(pDstSurface, renderTargetIndex, x, y, pClearColor); -+} -+ -+void InitSimLoadTilesTable(); -+void InitSimStoreTilesTable(); -+void InitSimClearTilesTable(); -+ -+/* Init Load/Store/ClearTiles Tables */ -+INLINE void swr_InitMemoryModule() -+{ -+ InitSimLoadTilesTable(); -+ InitSimStoreTilesTable(); -+ InitSimClearTilesTable(); -+} -diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h -new file mode 100644 -index 0000000..4d56ead ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_public.h -@@ -0,0 +1,40 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. 
-+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_PUBLIC_H -+#define SWR_PUBLIC_H -+ -+struct pipe_screen; -+struct sw_winsys; -+ -+#ifdef __cplusplus -+extern "C" { -+#endif -+ -+struct pipe_screen *swr_create_screen(struct sw_winsys *winsys); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp -new file mode 100644 -index 0000000..2510b3a ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_query.cpp -@@ -0,0 +1,334 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "pipe/p_defines.h" -+#include "util/u_memory.h" -+#include "os/os_time.h" -+#include "swr_context.h" -+#include "swr_fence.h" -+#include "swr_query.h" -+#include "swr_screen.h" -+#include "swr_state.h" -+ -+ -+static struct swr_query * -+swr_query(struct pipe_query *p) -+{ -+ return (struct swr_query *)p; -+} -+ -+static struct pipe_query * -+swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index) -+{ -+ struct swr_query *pq; -+ -+ assert(type < PIPE_QUERY_TYPES); -+ assert(index < MAX_SO_STREAMS); -+ -+ pq = CALLOC_STRUCT(swr_query); -+ -+ if (pq) { -+ pq->type = type; -+ pq->index = index; -+ } -+ -+ return (struct pipe_query *)pq; -+} -+ -+ -+static void -+swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q) -+{ -+ struct swr_query *pq = swr_query(q); -+ -+ if (pq->fence) { -+ if (!swr_is_fence_done(swr_fence(pq->fence))) { -+ swr_fence_submit(swr_context(pipe), pq->fence); -+ swr_fence_finish(pipe->screen, pq->fence, 0); -+ } -+ swr_fence_reference(pipe->screen, &pq->fence, NULL); -+ } -+ -+ FREE(pq); -+} -+ -+ -+// XXX Create a fence callback, rather than stalling SwrWaitForIdle -+static void -+swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ assert(pq->result); -+ union pipe_query_result *result = pq->result; -+ boolean enable_stats = pq->enable_stats; -+ SWR_STATS swr_stats = {0}; -+ -+ if (pq->fence) { -+ if (!swr_is_fence_done(swr_fence(pq->fence))) { -+ swr_fence_submit(ctx, pq->fence); -+ swr_fence_finish(pipe->screen, pq->fence, 0); -+ } -+ swr_fence_reference(pipe->screen, &pq->fence, NULL); -+ } -+ -+ /* -+ * These queries don't need SWR Stats enabled in the core -+ * Set and return. -+ */ -+ switch (pq->type) { -+ case PIPE_QUERY_TIMESTAMP: -+ case PIPE_QUERY_TIME_ELAPSED: -+ result->u64 = swr_get_timestamp(pipe->screen); -+ return; -+ break; -+ case PIPE_QUERY_TIMESTAMP_DISJOINT: -+ /* nothing to do here */ -+ return; -+ break; -+ case PIPE_QUERY_GPU_FINISHED: -+ result->b = TRUE; /* XXX TODO Add an api func to SWR to compare drawId -+ vs LastRetiredId? 
*/ -+ return; -+ break; -+ default: -+ /* Any query that needs SwrCore stats */ -+ break; -+ } -+ -+ /* -+ * All other results are collected from SwrCore counters -+ */ -+ -+ /* XXX, Should turn this into a fence callback and skip the stall */ -+ SwrGetStats(ctx->swrContext, &swr_stats); -+ /* SwrGetStats returns immediately, wait for collection */ -+ SwrWaitForIdle(ctx->swrContext); -+ -+ switch (pq->type) { -+ case PIPE_QUERY_OCCLUSION_PREDICATE: -+ case PIPE_QUERY_OCCLUSION_COUNTER: -+ result->u64 = swr_stats.DepthPassCount; -+ break; -+ case PIPE_QUERY_PRIMITIVES_GENERATED: -+ result->u64 = swr_stats.IaPrimitives; -+ break; -+ case PIPE_QUERY_PRIMITIVES_EMITTED: -+ result->u64 = swr_stats.SoNumPrimsWritten[pq->index]; -+ break; -+ case PIPE_QUERY_SO_STATISTICS: -+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { -+ struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; -+ so_stats->num_primitives_written = -+ swr_stats.SoNumPrimsWritten[pq->index]; -+ so_stats->primitives_storage_needed = -+ swr_stats.SoPrimStorageNeeded[pq->index]; -+ } break; -+ case PIPE_QUERY_PIPELINE_STATISTICS: { -+ struct pipe_query_data_pipeline_statistics *p_stats = -+ &result->pipeline_statistics; -+ p_stats->ia_vertices = swr_stats.IaVertices; -+ p_stats->ia_primitives = swr_stats.IaPrimitives; -+ p_stats->vs_invocations = swr_stats.VsInvocations; -+ p_stats->gs_invocations = swr_stats.GsInvocations; -+ p_stats->gs_primitives = swr_stats.GsPrimitives; -+ p_stats->c_invocations = swr_stats.CPrimitives; -+ p_stats->c_primitives = swr_stats.CPrimitives; -+ p_stats->ps_invocations = swr_stats.PsInvocations; -+ p_stats->hs_invocations = swr_stats.HsInvocations; -+ p_stats->ds_invocations = swr_stats.DsInvocations; -+ p_stats->cs_invocations = swr_stats.CsInvocations; -+ } break; -+ default: -+ assert(0 && "Unsupported query"); -+ break; -+ } -+ -+ /* Only change stat collection if there are no active queries */ -+ if (ctx->active_queries == 0) -+ SwrEnableStats(ctx->swrContext, enable_stats); -+} -+ -+ -+static boolean -+swr_get_query_result(struct pipe_context *pipe, -+ struct pipe_query *q, -+ boolean wait, -+ union pipe_query_result *result) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_query *pq = swr_query(q); -+ -+ if (pq->fence) { -+ if (!swr_is_fence_done(swr_fence(pq->fence))) { -+ swr_fence_submit(ctx, pq->fence); -+ if (!wait) -+ return FALSE; -+ swr_fence_finish(pipe->screen, pq->fence, 0); -+ } -+ swr_fence_reference(pipe->screen, &pq->fence, NULL); -+ } -+ -+ /* XXX: Need to handle counter rollover */ -+ -+ switch (pq->type) { -+ /* Booleans */ -+ case PIPE_QUERY_OCCLUSION_PREDICATE: -+ result->b = pq->end.u64 != pq->start.u64 ? 
TRUE : FALSE; -+ break; -+ case PIPE_QUERY_GPU_FINISHED: -+ result->b = pq->end.b; -+ break; -+ /* Counters */ -+ case PIPE_QUERY_OCCLUSION_COUNTER: -+ case PIPE_QUERY_TIMESTAMP: -+ case PIPE_QUERY_TIME_ELAPSED: -+ case PIPE_QUERY_PRIMITIVES_GENERATED: -+ case PIPE_QUERY_PRIMITIVES_EMITTED: -+ result->u64 = pq->end.u64 - pq->start.u64; -+ break; -+ /* Structures */ -+ case PIPE_QUERY_SO_STATISTICS: { -+ struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; -+ struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; -+ struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; -+ so_stats->num_primitives_written = -+ end->num_primitives_written - start->num_primitives_written; -+ so_stats->primitives_storage_needed = -+ end->primitives_storage_needed - start->primitives_storage_needed; -+ } break; -+ case PIPE_QUERY_TIMESTAMP_DISJOINT: { -+ /* os_get_time_nano returns nanoseconds */ -+ result->timestamp_disjoint.frequency = UINT64_C(1000000000); -+ result->timestamp_disjoint.disjoint = FALSE; -+ } break; -+ case PIPE_QUERY_PIPELINE_STATISTICS: { -+ struct pipe_query_data_pipeline_statistics *p_stats = -+ &result->pipeline_statistics; -+ struct pipe_query_data_pipeline_statistics *start = -+ &pq->start.pipeline_statistics; -+ struct pipe_query_data_pipeline_statistics *end = -+ &pq->end.pipeline_statistics; -+ p_stats->ia_vertices = end->ia_vertices - start->ia_vertices; -+ p_stats->ia_primitives = end->ia_primitives - start->ia_primitives; -+ p_stats->vs_invocations = end->vs_invocations - start->vs_invocations; -+ p_stats->gs_invocations = end->gs_invocations - start->gs_invocations; -+ p_stats->gs_primitives = end->gs_primitives - start->gs_primitives; -+ p_stats->c_invocations = end->c_invocations - start->c_invocations; -+ p_stats->c_primitives = end->c_primitives - start->c_primitives; -+ p_stats->ps_invocations = end->ps_invocations - start->ps_invocations; -+ p_stats->hs_invocations = end->hs_invocations - start->hs_invocations; -+ p_stats->ds_invocations = end->ds_invocations - start->ds_invocations; -+ p_stats->cs_invocations = end->cs_invocations - start->cs_invocations; -+ } break; -+ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { -+ struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; -+ struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; -+ uint64_t num_primitives_written = -+ end->num_primitives_written - start->num_primitives_written; -+ uint64_t primitives_storage_needed = -+ end->primitives_storage_needed - start->primitives_storage_needed; -+ result->b = num_primitives_written > primitives_storage_needed; -+ } break; -+ default: -+ assert(0 && "Unsupported query"); -+ break; -+ } -+ -+ return TRUE; -+} -+ -+static boolean -+swr_begin_query(struct pipe_context *pipe, struct pipe_query *q) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_query *pq = swr_query(q); -+ -+ /* Initialize Results */ -+ memset(&pq->start, 0, sizeof(pq->start)); -+ memset(&pq->end, 0, sizeof(pq->end)); -+ -+ /* Gather start stats and enable SwrCore counters */ -+ pq->result = &pq->start; -+ pq->enable_stats = TRUE; -+ swr_gather_stats(pipe, pq); -+ ctx->active_queries++; -+ -+ /* override start timestamp to 0 for TIMESTAMP query */ -+ if (pq->type == PIPE_QUERY_TIMESTAMP) -+ pq->start.u64 = 0; -+ -+ return true; -+} -+ -+static void -+swr_end_query(struct pipe_context *pipe, struct pipe_query *q) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_query *pq = swr_query(q); -+ -+ 
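/* Decrement the active query count before gathering, so that
-+ * swr_gather_stats() turns SwrCore stat collection back off when the
-+ * last active query ends. */
-+ 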
assert(ctx->active_queries -+ && "swr_end_query, there are no active queries!"); -+ ctx->active_queries--; -+ -+ /* Gather end stats and disable SwrCore counters */ -+ pq->result = &pq->end; -+ pq->enable_stats = FALSE; -+ swr_gather_stats(pipe, pq); -+} -+ -+ -+boolean -+swr_check_render_cond(struct pipe_context *pipe) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ boolean b, wait; -+ uint64_t result; -+ -+ if (!ctx->render_cond_query) -+ return TRUE; /* no query predicate, draw normally */ -+ -+ wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT -+ || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); -+ -+ b = pipe->get_query_result( -+ pipe, ctx->render_cond_query, wait, (union pipe_query_result *)&result); -+ if (b) -+ return (!result == ctx->render_cond_cond); -+ else -+ return TRUE; -+} -+ -+void -+swr_query_init(struct pipe_context *pipe) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ pipe->create_query = swr_create_query; -+ pipe->destroy_query = swr_destroy_query; -+ pipe->begin_query = swr_begin_query; -+ pipe->end_query = swr_end_query; -+ pipe->get_query_result = swr_get_query_result; -+ -+ ctx->active_queries = 0; -+} -diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h -new file mode 100644 -index 0000000..2a2aeee ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_query.h -@@ -0,0 +1,48 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#ifndef SWR_QUERY_H -+#define SWR_QUERY_H -+ -+ -+#include -+#include "os/os_thread.h" -+ -+ -+struct swr_query { -+ unsigned type; /* PIPE_QUERY_* */ -+ unsigned index; -+ -+ union pipe_query_result *result; -+ union pipe_query_result start; -+ union pipe_query_result end; -+ -+ struct pipe_fence_handle *fence; -+ -+ boolean enable_stats; -+}; -+ -+extern void swr_query_init(struct pipe_context *pipe); -+ -+extern boolean swr_check_render_cond(struct pipe_context *pipe); -+#endif -diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h -new file mode 100644 -index 0000000..f7f641e ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_resource.h -@@ -0,0 +1,98 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#ifndef SWR_RESOURCE_H -+#define SWR_RESOURCE_H -+ -+#include "pipe/p_state.h" -+#include "api.h" -+ -+struct sw_displaytarget; -+ -+struct swr_resource { -+ struct pipe_resource base; -+ -+ bool has_depth; -+ bool has_stencil; -+ -+ UINT alignedWidth; -+ UINT alignedHeight; -+ -+ SWR_SURFACE_STATE swr; -+ SWR_SURFACE_STATE secondary; // for faking depth/stencil merged formats -+ -+ struct sw_displaytarget *display_target; -+ -+ unsigned row_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ unsigned img_stride[PIPE_MAX_TEXTURE_LEVELS]; -+ unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; -+ -+ /* Opaque pointer to swr_context to mark resource in use */ -+ void *bound_to_context; -+}; -+ -+ -+static INLINE struct swr_resource * -+swr_resource(struct pipe_resource *resource) -+{ -+ return (struct swr_resource *)resource; -+} -+ -+static INLINE boolean -+swr_resource_is_texture(const struct pipe_resource *resource) -+{ -+ switch (resource->target) { -+ case PIPE_BUFFER: -+ return FALSE; -+ case PIPE_TEXTURE_1D: -+ case PIPE_TEXTURE_1D_ARRAY: -+ case PIPE_TEXTURE_2D: -+ case PIPE_TEXTURE_2D_ARRAY: -+ case PIPE_TEXTURE_RECT: -+ case PIPE_TEXTURE_3D: -+ case PIPE_TEXTURE_CUBE: -+ case PIPE_TEXTURE_CUBE_ARRAY: -+ return TRUE; -+ default: -+ assert(0); -+ return FALSE; -+ } -+} -+ -+ -+static INLINE void * -+swr_resource_data(struct pipe_resource *resource) -+{ -+ struct swr_resource *swr_r = swr_resource(resource); -+ -+ assert(!swr_resource_is_texture(resource)); -+ -+ return swr_r->swr.pBaseAddress; -+} -+ -+ -+void swr_store_render_target(struct swr_context *ctx, -+ uint32_t attachment, -+ enum SWR_TILE_STATE post_tile_state, -+ struct SWR_SURFACE_STATE *surface = nullptr); -+#endif -diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp -new file mode 100644 -index 0000000..e6c448c ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_scratch.cpp -@@ -0,0 +1,116 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "util/u_memory.h" -+#include "swr_context.h" -+#include "swr_scratch.h" -+#include "api.h" -+ -+ -+void * -+swr_copy_to_scratch_space(struct swr_context *ctx, -+ struct swr_scratch_space *space, -+ const void *user_buffer, -+ unsigned int size) -+{ -+ void *ptr; -+ assert(space); -+ assert(user_buffer); -+ assert(size); -+ -+ if (size >= 2048) { /* XXX TODO create KNOB_ for this */ -+ /* Use per draw SwrAllocDrawContextMemory for larger copies */ -+ ptr = SwrAllocDrawContextMemory(ctx->swrContext, size, 4); -+ } else { -+ /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */ -+ unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT; -+ -+ /* Need to grow space */ -+ if (max_size_in_flight > space->current_size) { -+ /* Must idle the pipeline, this is infrequent */ -+ SwrWaitForIdle(ctx->swrContext); -+ -+ space->current_size = max_size_in_flight; -+ -+ if (space->base) { -+ align_free(space->base); -+ space->base = NULL; -+ } -+ -+ if (!space->base) { -+ space->base = (BYTE *)align_malloc(space->current_size, 4); -+ space->head = (void *)space->base; -+ } -+ } -+ -+ /* Wrap */ -+ if (((BYTE *)space->head + size) -+ >= ((BYTE *)space->base + space->current_size)) { -+ /* -+ * TODO XXX: Should add a fence on wrap. Assumption is that -+ * current_space >> size, and there are at least MAX_DRAWS_IN_FLIGHT -+ * draws in scratch. So fence would always be met on wrap. A fence -+ * would ensure that first frame in buffer is done before wrapping. -+ * If fence ever needs to be waited on, can increase buffer size. -+ * So far in testing, this hasn't been necessary. -+ */ -+ space->head = space->base; -+ } -+ -+ ptr = space->head; -+ space->head = (BYTE *)space->head + size; -+ } -+ -+ /* Copy user_buffer to scratch */ -+ memcpy(ptr, user_buffer, size); -+ -+ return ptr; -+} -+ -+ -+void -+swr_init_scratch_buffers(struct swr_context *ctx) -+{ -+ struct swr_scratch_buffers *scratch; -+ -+ scratch = CALLOC_STRUCT(swr_scratch_buffers); -+ ctx->scratch = scratch; -+} -+ -+void -+swr_destroy_scratch_buffers(struct swr_context *ctx) -+{ -+ struct swr_scratch_buffers *scratch = ctx->scratch; -+ -+ if (scratch) { -+ if (scratch->vs_constants.base) -+ align_free(scratch->vs_constants.base); -+ if (scratch->fs_constants.base) -+ align_free(scratch->fs_constants.base); -+ if (scratch->vertex_buffer.base) -+ align_free(scratch->vertex_buffer.base); -+ if (scratch->index_buffer.base) -+ align_free(scratch->index_buffer.base); -+ FREE(scratch); -+ } -+} -diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h -new file mode 100644 -index 0000000..74218d6 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_scratch.h -@@ -0,0 +1,63 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_SCRATCH_H -+#define SWR_SCRATCH_H -+ -+struct swr_scratch_space { -+ void *head; -+ unsigned int current_size; -+ /* TODO XXX: Add a fence for wrap condition. */ -+ -+ void *base; -+}; -+ -+struct swr_scratch_buffers { -+ struct swr_scratch_space vs_constants; -+ struct swr_scratch_space fs_constants; -+ struct swr_scratch_space vertex_buffer; -+ struct swr_scratch_space index_buffer; -+}; -+ -+ -+/* -+ * swr_copy_to_scratch_space -+ * Copies size bytes of user_buffer into the scratch ring buffer. -+ * Used to store temporary data such as client arrays and constants. -+ * -+ * Inputs: -+ * space ptr to scratch pool (vs_constants, fs_constants) -+ * user_buffer, data to copy into scratch space -+ * size to be copied -+ * Returns: -+ * pointer to data copied to scratch space. -+ */ -+void *swr_copy_to_scratch_space(struct swr_context *ctx, -+ struct swr_scratch_space *space, -+ const void *user_buffer, -+ unsigned int size); -+ -+void swr_init_scratch_buffers(struct swr_context *ctx); -+void swr_destroy_scratch_buffers(struct swr_context *ctx); -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp -new file mode 100644 -index 0000000..66eb58b ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_screen.cpp -@@ -0,0 +1,666 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "pipe/p_screen.h" -+#include "pipe/p_defines.h" -+#include "util/u_memory.h" -+#include "util/u_format.h" -+#include "util/u_inlines.h" -+#include "util/u_cpu_detect.h" -+ -+#include "state_tracker/sw_winsys.h" -+ -+extern "C" { -+#include "gallivm/lp_bld_limits.h" -+} -+ -+#include "swr_public.h" -+#include "swr_screen.h" -+#include "swr_context.h" -+#include "swr_resource.h" -+#include "swr_fence.h" -+#include "gen_knobs.h" -+ -+#include "jit_api.h" -+ -+#include -+ -+static const char * -+swr_get_name(struct pipe_screen *screen) -+{ -+ return "SWR"; -+} -+ -+static const char * -+swr_get_vendor(struct pipe_screen *screen) -+{ -+ return "Intel Corporation"; -+} -+ -+static boolean -+swr_is_format_supported(struct pipe_screen *screen, -+ enum pipe_format format, -+ enum pipe_texture_target target, -+ unsigned sample_count, -+ unsigned bind) -+{ -+ struct sw_winsys *winsys = swr_screen(screen)->winsys; -+ const struct util_format_description *format_desc; -+ -+ assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D -+ || target == PIPE_TEXTURE_1D_ARRAY -+ || target == PIPE_TEXTURE_2D -+ || target == PIPE_TEXTURE_2D_ARRAY -+ || target == PIPE_TEXTURE_RECT -+ || target == PIPE_TEXTURE_3D -+ || target == PIPE_TEXTURE_CUBE -+ || target == PIPE_TEXTURE_CUBE_ARRAY); -+ -+ format_desc = util_format_description(format); -+ if (!format_desc) -+ return FALSE; -+ -+ if (sample_count > 1) -+ return FALSE; -+ -+ if (bind -+ & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) { -+ if (!winsys->is_displaytarget_format_supported(winsys, bind, format)) -+ return FALSE; -+ } -+ -+ if (bind & PIPE_BIND_RENDER_TARGET) { -+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) -+ return FALSE; -+ -+ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) -+ return FALSE; -+ -+ /* -+ * Although possible, it is unnatural to render into compressed or YUV -+ * surfaces. So disable these here to avoid going into weird paths -+ * inside the state trackers. -+ */ -+ if (format_desc->block.width != 1 || format_desc->block.height != 1) -+ return FALSE; -+ } -+ -+ /* We're going to lie and say we support all depth/stencil formats. -+ * SWR actually needs separate bindings, and only does F32 depth. 
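-+ * Merged depth/stencil formats are handled at resource-creation time by
-+ * splitting stencil into the separate "secondary" surface (see swr_resource).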
-+ */ -+ if (bind & PIPE_BIND_DEPTH_STENCIL) { -+ if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) -+ return FALSE; -+ } -+ -+ return TRUE; -+} -+ -+static int -+swr_get_param(struct pipe_screen *screen, enum pipe_cap param) -+{ -+ switch (param) { -+ case PIPE_CAP_NPOT_TEXTURES: -+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: -+ return 1; -+ case PIPE_CAP_TWO_SIDED_STENCIL: -+ return 1; -+ case PIPE_CAP_SM3: -+ return 1; -+ case PIPE_CAP_ANISOTROPIC_FILTER: -+ return 0; -+ case PIPE_CAP_POINT_SPRITE: -+ return 1; -+ case PIPE_CAP_MAX_RENDER_TARGETS: -+ return PIPE_MAX_COLOR_BUFS; -+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: -+ return 1; -+ case PIPE_CAP_OCCLUSION_QUERY: -+ case PIPE_CAP_QUERY_TIME_ELAPSED: -+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS: -+ return 1; -+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: -+ return 1; -+ case PIPE_CAP_TEXTURE_SHADOW_MAP: -+ return 1; -+ case PIPE_CAP_TEXTURE_SWIZZLE: -+ return 1; -+ case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: -+ return 0; -+ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: -+ return 13; // xxx This increases rendertarget max size to 4k x 4k. No -+ // way to separate widht/height. -+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: -+ return 12; // xxx -+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: -+ return 12; // xxx -+ case PIPE_CAP_BLEND_EQUATION_SEPARATE: -+ return 1; -+ case PIPE_CAP_INDEP_BLEND_ENABLE: -+ return 1; -+ case PIPE_CAP_INDEP_BLEND_FUNC: -+ return 1; -+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: -+ return 0; // Don't support lower left frag coord. -+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: -+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: -+ case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: -+ return 1; -+ case PIPE_CAP_DEPTH_CLIP_DISABLE: -+ return 1; -+ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: -+ return MAX_SO_STREAMS; -+ case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: -+ case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: -+ return MAX_ATTRIBUTES; -+ case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: -+ case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: -+ return 1024; -+ case PIPE_CAP_MAX_VERTEX_STREAMS: -+ return 1; -+ case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: -+ return 2048; -+ case PIPE_CAP_PRIMITIVE_RESTART: -+ return 1; -+ case PIPE_CAP_SHADER_STENCIL_EXPORT: -+ return 1; -+ case PIPE_CAP_TGSI_INSTANCEID: -+ case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: -+ case PIPE_CAP_START_INSTANCE: -+ return 1; -+ case PIPE_CAP_SEAMLESS_CUBE_MAP: -+ case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: -+ return 1; -+ case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: -+ return 256; /* for GL3 */ -+ case PIPE_CAP_MIN_TEXEL_OFFSET: -+ return -8; -+ case PIPE_CAP_MAX_TEXEL_OFFSET: -+ return 7; -+ case PIPE_CAP_CONDITIONAL_RENDER: -+ return 1; -+ case PIPE_CAP_TEXTURE_BARRIER: -+ return 0; -+ case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: -+ case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: /* draw module */ -+ case PIPE_CAP_VERTEX_COLOR_CLAMPED: /* draw module */ -+ return 1; -+ case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: -+ return 0; -+ case PIPE_CAP_GLSL_FEATURE_LEVEL: -+ return 330; -+ case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: -+ return 0; -+ case PIPE_CAP_COMPUTE: -+ return 0; -+ case PIPE_CAP_USER_VERTEX_BUFFERS: -+ case PIPE_CAP_USER_INDEX_BUFFERS: -+ case PIPE_CAP_USER_CONSTANT_BUFFERS: -+ case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: -+ case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: -+ return 1; -+ case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: -+ return 16; -+ case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: -+ case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: -+ case 
PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: -+ case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: -+ case PIPE_CAP_TEXTURE_MULTISAMPLE: -+ return 0; -+ case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: -+ return 64; -+ case PIPE_CAP_QUERY_TIMESTAMP: -+ return 1; -+ case PIPE_CAP_CUBE_MAP_ARRAY: -+ return 0; -+ case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: -+ return 1; -+ case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: -+ return 65536; -+ case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: -+ return 0; -+ case PIPE_CAP_TGSI_TEXCOORD: -+ case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: -+ return 0; -+ case PIPE_CAP_MAX_VIEWPORTS: -+ return 1; -+ case PIPE_CAP_ENDIANNESS: -+ return PIPE_ENDIAN_NATIVE; -+ case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: -+ case PIPE_CAP_TEXTURE_GATHER_SM5: -+ return 0; -+ case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: -+ return 1; -+ case PIPE_CAP_TEXTURE_QUERY_LOD: -+ case PIPE_CAP_SAMPLE_SHADING: -+ case PIPE_CAP_TEXTURE_GATHER_OFFSETS: -+ case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: -+ case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: -+ case PIPE_CAP_SAMPLER_VIEW_TARGET: -+ return 0; -+ case PIPE_CAP_FAKE_SW_MSAA: -+ return 1; -+ case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: -+ case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: -+ return 0; -+ case PIPE_CAP_DRAW_INDIRECT: -+ return 1; -+ -+ case PIPE_CAP_VENDOR_ID: -+ return 0xFFFFFFFF; -+ case PIPE_CAP_DEVICE_ID: -+ return 0xFFFFFFFF; -+ case PIPE_CAP_ACCELERATED: -+ return 0; -+ case PIPE_CAP_VIDEO_MEMORY: { -+ /* XXX: Do we want to return the full amount of system memory ? */ -+ uint64_t system_memory; -+ -+ if (!os_get_total_physical_memory(&system_memory)) -+ return 0; -+ -+ return (int)(system_memory >> 20); -+ } -+ case PIPE_CAP_UMA: -+ return 1; -+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: -+ return 1; -+ case PIPE_CAP_CLIP_HALFZ: -+ return 1; -+ case PIPE_CAP_VERTEXID_NOBASE: -+ return 0; -+ case PIPE_CAP_POLYGON_OFFSET_CLAMP: -+ return 1; -+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: -+ return 0; -+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: -+ return 0; // xxx -+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: -+ return 0; -+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: -+ return 0; -+ case PIPE_CAP_DEPTH_BOUNDS_TEST: -+ return 0; // xxx -+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR: -+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: -+ return 1; -+ } -+ -+ /* should only get here on unhandled cases */ -+ debug_printf("Unexpected PIPE_CAP %d query\n", param); -+ return 0; -+} -+ -+static int -+swr_get_shader_param(struct pipe_screen *screen, -+ unsigned shader, -+ enum pipe_shader_cap param) -+{ -+ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_FRAGMENT) -+ return gallivm_get_shader_param(param); -+ -+ // Todo: geometry, tesselation, compute -+ return 0; -+} -+ -+ -+static float -+swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param) -+{ -+ switch (param) { -+ case PIPE_CAPF_MAX_LINE_WIDTH: -+ case PIPE_CAPF_MAX_LINE_WIDTH_AA: -+ case PIPE_CAPF_MAX_POINT_WIDTH: -+ return 255.0; /* arbitrary */ -+ case PIPE_CAPF_MAX_POINT_WIDTH_AA: -+ return 0.0; -+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: -+ return 0.0; -+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: -+ return 0.0; -+ case PIPE_CAPF_GUARD_BAND_LEFT: -+ case PIPE_CAPF_GUARD_BAND_TOP: -+ case PIPE_CAPF_GUARD_BAND_RIGHT: -+ case PIPE_CAPF_GUARD_BAND_BOTTOM: -+ return 0.0; -+ } -+ /* should only get here on unhandled cases */ -+ debug_printf("Unexpected PIPE_CAPF %d query\n", param); -+ return 0.0; -+} -+ -+SWR_FORMAT -+mesa_to_swr_format(enum pipe_format format) -+{ -+ const struct util_format_description 
*format_desc = -+ util_format_description(format); -+ if (!format_desc) -+ return (SWR_FORMAT)-1; -+ -+ // more robust check would be comparing all attributes of the formats -+ // luckily format names are mostly standardized -+ for (int i = 0; i < NUM_SWR_FORMATS; i++) { -+ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo((SWR_FORMAT)i); -+ -+ if (!strcasecmp(format_desc->short_name, swr_desc.name)) -+ return (SWR_FORMAT)i; -+ } -+ -+ // ... with some exceptions -+ switch (format) { -+ case PIPE_FORMAT_R8G8B8A8_SRGB: -+ return R8G8B8A8_UNORM_SRGB; -+ case PIPE_FORMAT_B8G8R8A8_SRGB: -+ return B8G8R8A8_UNORM_SRGB; -+ case PIPE_FORMAT_I8_UNORM: -+ return R8_UNORM; -+ case PIPE_FORMAT_Z24_UNORM_S8_UINT: -+ return R24_UNORM_X8_TYPELESS; -+ case PIPE_FORMAT_L8A8_UNORM: -+ return R8G8_UNORM; -+ default: -+ break; -+ } -+ -+ debug_printf("asked to convert unsupported format %s\n", -+ format_desc->name); -+ return (SWR_FORMAT)-1; -+} -+ -+static boolean -+swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res) -+{ -+ struct sw_winsys *winsys = screen->winsys; -+ -+ UINT stride; -+ res->display_target = winsys->displaytarget_create(winsys, -+ res->base.bind, -+ res->base.format, -+ res->alignedWidth, -+ res->alignedHeight, -+ 64, -+ &stride); -+ -+ if (res->display_target == NULL) -+ return FALSE; -+ -+ /* Clear the display target surface */ -+ void *map = winsys->displaytarget_map( -+ winsys, res->display_target, PIPE_TRANSFER_WRITE); -+ -+ if (map) -+ memset(map, 0, res->alignedHeight * stride); -+ -+ winsys->displaytarget_unmap(winsys, res->display_target); -+ -+ return TRUE; -+} -+ -+static struct pipe_resource * -+swr_resource_create(struct pipe_screen *_screen, -+ const struct pipe_resource *templat) -+{ -+ struct swr_screen *screen = swr_screen(_screen); -+ struct swr_resource *res = CALLOC_STRUCT(swr_resource); -+ if (!res) -+ return NULL; -+ -+ res->base = *templat; -+ pipe_reference_init(&res->base.reference, 1); -+ res->base.screen = &screen->base; -+ -+ const struct util_format_description *desc = -+ util_format_description(templat->format); -+ res->has_depth = util_format_has_depth(desc); -+ res->has_stencil = util_format_has_stencil(desc); -+ -+ pipe_format fmt = templat->format; -+ if (res->has_depth) -+ fmt = PIPE_FORMAT_Z24_UNORM_S8_UINT; -+ if (res->has_stencil && !res->has_depth) -+ fmt = PIPE_FORMAT_R8_UINT; -+ -+ res->swr.width = templat->width0; -+ res->swr.height = templat->height0; -+ res->swr.depth = templat->depth0; -+ res->swr.type = SURFACE_2D; -+ res->swr.tileMode = SWR_TILE_NONE; -+ res->swr.format = mesa_to_swr_format(fmt); -+ res->swr.numSamples = (1 << templat->nr_samples); -+ -+ SWR_FORMAT_INFO finfo = GetFormatInfo(res->swr.format); -+ -+ unsigned total_size = 0; -+ unsigned width = templat->width0; -+ unsigned height = templat->height0; -+ unsigned depth = templat->depth0; -+ unsigned layers = templat->array_size; -+ -+ for (int level = 0; level <= templat->last_level; level++) { -+ unsigned alignedWidth, alignedHeight; -+ unsigned num_slices; -+ -+ if (templat->bind & (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET -+ | PIPE_BIND_DISPLAY_TARGET)) { -+ alignedWidth = (width + (KNOB_MACROTILE_X_DIM - 1)) -+ & ~(KNOB_MACROTILE_X_DIM - 1); -+ alignedHeight = (height + (KNOB_MACROTILE_Y_DIM - 1)) -+ & ~(KNOB_MACROTILE_Y_DIM - 1); -+ } else { -+ alignedWidth = width; -+ alignedHeight = height; -+ } -+ -+ if (level == 0) { -+ res->alignedWidth = alignedWidth; -+ res->alignedHeight = alignedHeight; -+ } -+ -+ res->row_stride[level] = alignedWidth 
* finfo.Bpp; -+ res->img_stride[level] = res->row_stride[level] * alignedHeight; -+ res->mip_offsets[level] = total_size; -+ -+ if (templat->target == PIPE_TEXTURE_3D) -+ num_slices = depth; -+ else if (templat->target == PIPE_TEXTURE_1D_ARRAY -+ || templat->target == PIPE_TEXTURE_2D_ARRAY -+ || templat->target == PIPE_TEXTURE_CUBE -+ || templat->target == PIPE_TEXTURE_CUBE_ARRAY) -+ num_slices = layers; -+ else -+ num_slices = 1; -+ -+ total_size += res->img_stride[level] * num_slices; -+ -+ width = u_minify(width, 1); -+ height = u_minify(height, 1); -+ depth = u_minify(depth, 1); -+ } -+ -+ res->swr.halign = res->alignedWidth; -+ res->swr.valign = res->alignedHeight; -+ res->swr.pitch = res->row_stride[0]; -+ res->swr.pBaseAddress = (BYTE *)_aligned_malloc(total_size, 64); -+ -+ if (res->has_depth && res->has_stencil) { -+ res->secondary.width = templat->width0; -+ res->secondary.height = templat->height0; -+ res->secondary.depth = templat->depth0; -+ res->secondary.type = SURFACE_2D; -+ res->secondary.tileMode = SWR_TILE_NONE; -+ res->secondary.format = R8_UINT; -+ res->secondary.numSamples = (1 << templat->nr_samples); -+ -+ SWR_FORMAT_INFO finfo = GetFormatInfo(res->secondary.format); -+ res->secondary.pitch = res->alignedWidth * finfo.Bpp; -+ res->secondary.pBaseAddress = (BYTE *)_aligned_malloc( -+ res->alignedHeight * res->secondary.pitch, 64); -+ } -+ -+ if (swr_resource_is_texture(&res->base)) { -+ if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT -+ | PIPE_BIND_SHARED)) { -+ /* displayable surface */ -+ if (!swr_displaytarget_layout(screen, res)) -+ goto fail; -+ } -+ } -+ -+ return &res->base; -+ -+fail: -+ FREE(res); -+ return NULL; -+} -+ -+static void -+swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) -+{ -+ struct swr_screen *screen = swr_screen(p_screen); -+ struct swr_resource *res = swr_resource(pt); -+ -+ /* -+ * If this resource is attached to a context it may still be in use, check -+ * dependencies before freeing -+ * XXX TODO: don't use SwrWaitForIdle, use fences and come up with a real -+ * resource manager. -+ * XXX It's happened that we get a swr_destroy prior to freeing the -+ * framebuffer resource. Don't wait on it. -+ */ -+ if (res->bound_to_context && !res->display_target) { -+ struct swr_context *ctx = -+ swr_context((pipe_context *)res->bound_to_context); -+ SwrWaitForIdle( -+ ctx->swrContext); // BMCDEBUG, don't SwrWaitForIdle!!! Use a fence. 
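/* Editor's sketch (not part of the original patch): the XXX above asks for a
 * fence-based wait here instead of the full-pipeline SwrWaitForIdle().
 * Assuming a hypothetical per-resource fence handle (res->busy_fence)
 * signalled by the draw path -- no such field exists in this patch -- the
 * wait could reuse the swr_fence helpers the patch already introduces:
 *
 *    if (res->bound_to_context && !res->display_target) {
 *       // Block only until work referencing this resource has retired,
 *       // rather than draining the whole SWR pipeline.
 *       swr_fence_finish(p_screen, res->busy_fence, 0);
 *       swr_fence_reference(p_screen, &res->busy_fence, NULL);
 *    }
 */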
-+ } -+ -+ if (res->display_target) { -+ /* display target */ -+ struct sw_winsys *winsys = screen->winsys; -+ winsys->displaytarget_destroy(winsys, res->display_target); -+ } -+ -+ _aligned_free(res->swr.pBaseAddress); -+ _aligned_free(res->secondary.pBaseAddress); -+ -+ FREE(res); -+} -+ -+ -+static void -+swr_flush_frontbuffer(struct pipe_screen *p_screen, -+ struct pipe_resource *resource, -+ unsigned level, -+ unsigned layer, -+ void *context_private, -+ struct pipe_box *sub_box) -+{ -+ SWR_SURFACE_STATE &colorBuffer = swr_resource(resource)->swr; -+ -+ struct swr_screen *screen = swr_screen(p_screen); -+ struct sw_winsys *winsys = screen->winsys; -+ struct swr_resource *res = swr_resource(resource); -+ -+ /* Ensure fence set at flush is finished, before reading frame buffer */ -+ swr_fence_finish(p_screen, screen->flush_fence, 0); -+ -+ void *map = winsys->displaytarget_map( -+ winsys, res->display_target, PIPE_TRANSFER_WRITE); -+ memcpy( -+ map, colorBuffer.pBaseAddress, colorBuffer.pitch * colorBuffer.height); -+ winsys->displaytarget_unmap(winsys, res->display_target); -+ -+ assert(res->display_target); -+ if (res->display_target) -+ winsys->displaytarget_display( -+ winsys, res->display_target, context_private, sub_box); -+} -+ -+ -+static void -+swr_destroy_screen(struct pipe_screen *p_screen) -+{ -+ struct swr_screen *screen = swr_screen(p_screen); -+ struct sw_winsys *winsys = screen->winsys; -+ -+ fprintf(stderr, "SWR destroy screen!\n"); -+ -+ swr_fence_finish(p_screen, screen->flush_fence, 0); -+ swr_fence_reference(p_screen, &screen->flush_fence, NULL); -+ -+ JitDestroyContext(screen->hJitMgr); -+ -+ if (winsys->destroy) -+ winsys->destroy(winsys); -+ -+ FREE(screen); -+} -+ -+ -+struct pipe_screen * -+swr_create_screen(struct sw_winsys *winsys) -+{ -+ struct swr_screen *screen = CALLOC_STRUCT(swr_screen); -+ -+ if (!screen) -+ return NULL; -+ -+ fprintf(stderr, "SWR create screen!\n"); -+ util_cpu_detect(); -+ if (util_cpu_caps.has_avx2) -+ fprintf(stderr, "This processor supports AVX2.\n"); -+ else if (util_cpu_caps.has_avx) -+ fprintf(stderr, "This processor supports AVX.\n"); -+ /* Exit gracefully if there is no AVX support */ -+ else { -+ fprintf(stderr, " !!! This processor does not support AVX or AVX2. " -+ "OpenSWR requires AVX.\n"); -+ exit(-1); -+ } -+ -+ if (!getenv("KNOB_MAX_PRIMS_PER_DRAW")) { -+ g_GlobalKnobs.MAX_PRIMS_PER_DRAW.Value(49152); -+ } -+ -+ screen->winsys = winsys; -+ screen->base.get_name = swr_get_name; -+ screen->base.get_vendor = swr_get_vendor; -+ screen->base.is_format_supported = swr_is_format_supported; -+ screen->base.context_create = swr_create_context; -+ -+ screen->base.destroy = swr_destroy_screen; -+ screen->base.get_param = swr_get_param; -+ screen->base.get_shader_param = swr_get_shader_param; -+ screen->base.get_paramf = swr_get_paramf; -+ -+ screen->base.resource_create = swr_resource_create; -+ screen->base.resource_destroy = swr_resource_destroy; -+ -+ screen->base.flush_frontbuffer = swr_flush_frontbuffer; -+ -+ screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR); -+ -+ swr_fence_init(&screen->base); -+ -+ return &screen->base; -+} -diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h -new file mode 100644 -index 0000000..a96dc44 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_screen.h -@@ -0,0 +1,52 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
-+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_SCREEN_H -+#define SWR_SCREEN_H -+ -+#include "pipe/p_screen.h" -+#include "pipe/p_defines.h" -+#include "api.h" -+ -+struct sw_winsys; -+ -+struct swr_screen { -+ struct pipe_screen base; -+ -+ struct pipe_fence_handle *flush_fence; -+ -+ struct sw_winsys *winsys; -+ -+ HANDLE hJitMgr; -+}; -+ -+static INLINE struct swr_screen * -+swr_screen(struct pipe_screen *pipe) -+{ -+ return (struct swr_screen *)pipe; -+} -+ -+SWR_FORMAT -+mesa_to_swr_format(enum pipe_format format); -+ -+#endif -diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp -new file mode 100644 -index 0000000..edad4c2 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_shader.cpp -@@ -0,0 +1,608 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#include "JitManager.h" -+#include "state.h" -+#include "state_llvm.h" -+#include "builder.h" -+ -+#include "llvm-c/Core.h" -+#include "llvm/Support/CBindingWrapping.h" -+ -+#include "tgsi/tgsi_strings.h" -+#include "gallivm/lp_bld_init.h" -+#include "gallivm/lp_bld_flow.h" -+#include "gallivm/lp_bld_struct.h" -+#include "gallivm/lp_bld_tgsi.h" -+ -+#include "swr_context.h" -+#include "swr_context_llvm.h" -+#include "swr_state.h" -+#include "swr_screen.h" -+ -+bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs) -+{ -+ return !memcmp(&lhs, &rhs, sizeof(lhs)); -+} -+ -+void -+swr_generate_fs_key(struct swr_jit_key &key, -+ struct swr_context *ctx, -+ swr_fragment_shader *swr_fs) -+{ -+ key.nr_cbufs = ctx->framebuffer.nr_cbufs; -+ key.light_twoside = ctx->rasterizer->light_twoside; -+ memcpy(&key.vs_output_semantic_name, -+ &ctx->vs->info.base.output_semantic_name, -+ sizeof(key.vs_output_semantic_name)); -+ memcpy(&key.vs_output_semantic_idx, -+ &ctx->vs->info.base.output_semantic_index, -+ sizeof(key.vs_output_semantic_idx)); -+ -+ key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1; -+ -+ for (unsigned i = 0; i < key.nr_samplers; i++) { -+ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { -+ lp_sampler_static_sampler_state( -+ &key.sampler[i].sampler_state, -+ ctx->samplers[PIPE_SHADER_FRAGMENT][i]); -+ } -+ } -+ -+ /* -+ * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes -+ * are dx10-style? Can't really have mixed opcodes, at least not -+ * if we want to skip the holes here (without rescanning tgsi). -+ */ -+ if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { -+ key.nr_sampler_views = -+ swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; -+ for (unsigned i = 0; i < key.nr_sampler_views; i++) { -+ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { -+ lp_sampler_static_texture_state( -+ &key.sampler[i].texture_state, -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); -+ } -+ } -+ } else { -+ key.nr_sampler_views = key.nr_samplers; -+ for (unsigned i = 0; i < key.nr_sampler_views; i++) { -+ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { -+ lp_sampler_static_texture_state( -+ &key.sampler[i].texture_state, -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); -+ } -+ } -+ } -+ -+ memcpy(&key.alphaTest, -+ &ctx->depth_stencil->alpha, -+ sizeof(struct pipe_alpha_state)); -+} -+ -+struct BuilderSWR : public Builder { -+ BuilderSWR(JitManager *pJitMgr) -+ : Builder(pJitMgr) -+ { -+ pJitMgr->SetupNewModule(); -+ } -+ -+ PFN_VERTEX_FUNC -+ CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs); -+ PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key); -+}; -+ -+PFN_VERTEX_FUNC -+BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) -+{ -+ swr_vs->linkageMask = 0; -+ -+ for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) { -+ switch (swr_vs->info.base.output_semantic_name[i]) { -+ case TGSI_SEMANTIC_POSITION: -+ break; -+ case TGSI_SEMANTIC_PSIZE: -+ swr_vs->pointSizeAttrib = i; -+ break; -+ default: -+ swr_vs->linkageMask |= (1 << i); -+ break; -+ } -+ } -+ -+ // tgsi_dump(swr_vs->pipe.tokens, 0); -+ -+ struct gallivm_state *gallivm = -+ gallivm_create("VS", wrap(&JM()->mContext)); -+ gallivm->module = wrap(JM()->mpCurrentModule); -+ -+ LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; -+ LLVMValueRef 
outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; -+ -+ memset(outputs, 0, sizeof(outputs)); -+ -+ AttrBuilder attrBuilder; -+ attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); -+ AttributeSet attrSet = AttributeSet::get( -+ JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); -+ -+ std::vector vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), -+ PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; -+ FunctionType *vsFuncType = -+ FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); -+ -+ // create new vertex shader function -+ auto pFunction = Function::Create(vsFuncType, -+ GlobalValue::ExternalLinkage, -+ "VS", -+ JM()->mpCurrentModule); -+ pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -+ -+ BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); -+ IRB()->SetInsertPoint(block); -+ LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); -+ -+ auto argitr = pFunction->getArgumentList().begin(); -+ Value *hPrivateData = argitr++; -+ hPrivateData->setName("hPrivateData"); -+ Value *pVsCtx = argitr++; -+ pVsCtx->setName("vsCtx"); -+ -+ Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantVS}); -+ consts_ptr->setName("vs_constants"); -+ Value *const_sizes_ptr = -+ GEP(hPrivateData, {0, swr_draw_context_num_constantsVS}); -+ const_sizes_ptr->setName("num_vs_constants"); -+ -+ Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); -+ -+ for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { -+ const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; -+ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ if (mask & (1 << channel)) { -+ inputs[attrib][channel] = -+ wrap(LOAD(vtxInput, {0, 0, attrib, channel})); -+ } -+ } -+ } -+ -+ struct lp_bld_tgsi_system_values system_values; -+ memset(&system_values, 0, sizeof(system_values)); -+ system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); -+ system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); -+ -+ lp_build_tgsi_soa(gallivm, -+ swr_vs->pipe.tokens, -+ lp_type_float_vec(32, 32 * 8), -+ NULL, // mask -+ wrap(consts_ptr), -+ wrap(const_sizes_ptr), -+ &system_values, -+ inputs, -+ outputs, -+ NULL, // wrap(hPrivateData), (sampler context) -+ NULL, // sampler -+ &swr_vs->info.base, -+ NULL); // geometry shader face -+ -+ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); -+ -+ Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); -+ -+ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { -+ if (!outputs[attrib][channel]) -+ continue; -+ -+ Value *val = LOAD(unwrap(outputs[attrib][channel])); -+ STORE(val, vtxOutput, {0, 0, attrib, channel}); -+ } -+ } -+ -+ RET_VOID(); -+ -+ gallivm_verify_function(gallivm, wrap(pFunction)); -+ gallivm_compile_module(gallivm); -+ -+ // lp_debug_dump_value(func); -+ -+ PFN_VERTEX_FUNC pFunc = -+ (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); -+ -+ debug_printf("vert shader %p\n", pFunc); -+ assert(pFunc && "Error: VertShader = NULL"); -+ -+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) -+ JM()->mIsModuleFinalized = true; -+#endif -+ -+ return pFunc; -+} -+ -+PFN_VERTEX_FUNC -+swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs) -+{ -+ BuilderSWR builder( -+ reinterpret_cast(swr_screen(ctx->screen)->hJitMgr)); -+ return builder.CompileVS(ctx, swr_vs); -+} -+ -+static unsigned 
-+locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) -+{ -+ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { -+ if ((info->output_semantic_name[i] == name) -+ && (info->output_semantic_index[i] == index)) { -+ return i - 1; // position is not part of the linkage -+ } -+ } -+ -+ if (name == TGSI_SEMANTIC_COLOR) { // BCOLOR fallback -+ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { -+ if ((info->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) -+ && (info->output_semantic_index[i] == index)) { -+ return i - 1; // position is not part of the linkage -+ } -+ } -+ } -+ -+ return 0xFFFFFFFF; -+} -+ -+PFN_PIXEL_KERNEL -+BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) -+{ -+ struct swr_fragment_shader *swr_fs = ctx->fs; -+ -+ // tgsi_dump(swr_fs->pipe.tokens, 0); -+ -+ struct gallivm_state *gallivm = -+ gallivm_create("FS", wrap(&JM()->mContext)); -+ gallivm->module = wrap(JM()->mpCurrentModule); -+ -+ LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; -+ LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; -+ -+ memset(inputs, 0, sizeof(inputs)); -+ memset(outputs, 0, sizeof(outputs)); -+ -+ struct lp_build_sampler_soa *sampler = NULL; -+ -+ AttrBuilder attrBuilder; -+ attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); -+ AttributeSet attrSet = AttributeSet::get( -+ JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); -+ -+ std::vector fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), -+ PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; -+ FunctionType *funcType = -+ FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); -+ -+ auto pFunction = Function::Create(funcType, -+ GlobalValue::ExternalLinkage, -+ "FS", -+ JM()->mpCurrentModule); -+ pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); -+ -+ BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); -+ IRB()->SetInsertPoint(block); -+ LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); -+ -+ auto &args = pFunction->getArgumentList(); -+ Value *hPrivateData = args.begin(); -+ hPrivateData->setName("hPrivateData"); -+ Value *pPS = ++args.begin(); -+ pPS->setName("psCtx"); -+ -+ Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS}); -+ consts_ptr->setName("fs_constants"); -+ Value *const_sizes_ptr = -+ GEP(hPrivateData, {0, swr_draw_context_num_constantsFS}); -+ const_sizes_ptr->setName("num_fs_constants"); -+ -+ // xxx should check for flat shading versus interpolation -+ -+ // load i -+ Value *vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI}, "i"); -+ -+ // load j -+ Value *vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ}, "j"); -+ -+ // load/compute w -+ Value *vw = FDIV(VIMMED1(1.0f), LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW})); -+ vw->setName("w"); -+ -+ // load *pAttribs, *pPerspAttribs -+ Value *pAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pAttribs"); -+ Value *pPerspAttribs = -+ LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs"); -+ -+ swr_fs->constantMask = 0; -+ -+ for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { -+ const unsigned mask = swr_fs->info.base.input_usage_mask[attrib]; -+ const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib]; -+ -+ if (!mask) -+ continue; -+ -+ ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib]; -+ ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib]; -+ -+ if (semantic_name == TGSI_SEMANTIC_FACE) { -+ Value *ff = -+ UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty); -+ ff = 
FSUB(FMUL(ff, C(2.0f)), C(1.0f)); -+ ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace"); -+ -+ inputs[attrib][0] = wrap(ff); -+ inputs[attrib][1] = wrap(VIMMED1(0.0f)); -+ inputs[attrib][2] = wrap(VIMMED1(0.0f)); -+ inputs[attrib][3] = wrap(VIMMED1(1.0f)); -+ continue; -+ } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord -+ inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX}, "vX")); -+ inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY}, "vY")); -+ inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ")); -+ inputs[attrib][3] = -+ wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW}, "vOneOverW")); -+ continue; -+ } else if (semantic_name == TGSI_SEMANTIC_PRIMID) { -+ Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID"); -+ inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID)); -+ inputs[attrib][1] = wrap(VIMMED1(0)); -+ inputs[attrib][2] = wrap(VIMMED1(0)); -+ inputs[attrib][3] = wrap(VIMMED1(0)); -+ continue; -+ } -+ -+ unsigned linkedAttrib = -+ locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); -+ if (linkedAttrib == 0xFFFFFFFF) { -+ // not found - check for point sprite -+ if (ctx->rasterizer->sprite_coord_enable) { -+ linkedAttrib = ctx->vs->info.base.num_outputs - 1; -+ } else { -+ fprintf(stderr, -+ "Missing %s[%d]\n", -+ tgsi_semantic_names[semantic_name], -+ semantic_idx); -+ assert(0 && "attribute linkage not found"); -+ } -+ } -+ -+ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { -+ swr_fs->constantMask |= 1 << linkedAttrib; -+ } -+ -+ for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ if (mask & (1 << channel)) { -+ Value *indexA = C(linkedAttrib * 12 + channel); -+ Value *indexB = C(linkedAttrib * 12 + channel + 4); -+ Value *indexC = C(linkedAttrib * 12 + channel + 8); -+ -+ if ((semantic_name == TGSI_SEMANTIC_COLOR) -+ && ctx->rasterizer->light_twoside) { -+ unsigned bcolorAttrib = locate_linkage( -+ TGSI_SEMANTIC_BCOLOR, semantic_idx, &ctx->vs->info.base); -+ -+ unsigned diff = 12 * (bcolorAttrib - linkedAttrib); -+ -+ Value *back = -+ XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace"); -+ -+ Value *offset = MUL(back, C(diff)); -+ offset->setName("offset"); -+ -+ indexA = ADD(indexA, offset); -+ indexB = ADD(indexB, offset); -+ indexC = ADD(indexC, offset); -+ -+ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { -+ swr_fs->constantMask |= 1 << bcolorAttrib; -+ } -+ } -+ -+ Value *pAttribPtr = (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) -+ ? 
pPerspAttribs -+ : pAttribs; -+ -+ Value *va = -+ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexA))); -+ Value *vb = -+ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexB))); -+ Value *vc = -+ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexC))); -+ -+ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { -+ inputs[attrib][channel] = wrap(va); -+ } else { -+ Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj); -+ -+ vc = FMUL(vk, vc); -+ -+ Value *interp = FMUL(va, vi); -+ Value *interp1 = FMUL(vb, vj); -+ interp = FADD(interp, interp1); -+ interp = FADD(interp, vc); -+ if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) -+ interp = FMUL(interp, vw); -+ inputs[attrib][channel] = wrap(interp); -+ } -+ } -+ } -+ } -+ -+ sampler = swr_sampler_soa_create(key.sampler); -+ -+ struct lp_bld_tgsi_system_values system_values; -+ memset(&system_values, 0, sizeof(system_values)); -+ -+ struct lp_build_mask_context mask; -+ -+ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { -+ Value *mask_val = LOAD(pPS, {0, SWR_PS_CONTEXT_mask}, "coverage_mask"); -+ lp_build_mask_begin( -+ &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); -+ } -+ -+ lp_build_tgsi_soa(gallivm, -+ swr_fs->pipe.tokens, -+ lp_type_float_vec(32, 32 * 8), -+ swr_fs->info.base.uses_kill ? &mask : NULL, // mask -+ wrap(consts_ptr), -+ wrap(const_sizes_ptr), -+ &system_values, -+ inputs, -+ outputs, -+ wrap(hPrivateData), -+ sampler, // sampler -+ &swr_fs->info.base, -+ NULL); // geometry shader face -+ -+ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); -+ -+ for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs; -+ attrib++) { -+ switch (swr_fs->info.base.output_semantic_name[attrib]) { -+ case TGSI_SEMANTIC_POSITION: { -+ // write z -+ LLVMValueRef outZ = -+ LLVMBuildLoad(gallivm->builder, outputs[attrib][2], ""); -+ STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ}); -+ break; -+ } -+ case TGSI_SEMANTIC_COLOR: { -+ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { -+ if (!outputs[attrib][channel]) -+ continue; -+ -+ LLVMValueRef out = -+ LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], ""); -+ if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { -+ for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) { -+ STORE(unwrap(out), -+ pPS, -+ {0, SWR_PS_CONTEXT_shaded, rt, channel}); -+ } -+ } else { -+ STORE(unwrap(out), -+ pPS, -+ {0, -+ SWR_PS_CONTEXT_shaded, -+ swr_fs->info.base.output_semantic_index[attrib], -+ channel}); -+ } -+ } -+ break; -+ } -+ default: { -+ fprintf(stderr, -+ "unknown output from FS %s[%d]\n", -+ tgsi_semantic_names[swr_fs->info.base -+ .output_semantic_name[attrib]], -+ swr_fs->info.base.output_semantic_index[attrib]); -+ break; -+ } -+ } -+ } -+ -+ LLVMValueRef mask_result = 0; -+ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { -+ mask_result = lp_build_mask_end(&mask); -+ } -+ -+ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); -+ -+ if (key.alphaTest.enabled) { -+ unsigned linkage = -+ locate_linkage(TGSI_SEMANTIC_COLOR, 0, &ctx->fs->info.base) + 1; -+ -+ Value *alpha = LOAD( -+ pPS, {0, SWR_PS_CONTEXT_shaded, linkage, 3 /* alpha */}, "alpha"); -+ Value *ref = VIMMED1(key.alphaTest.ref_value); -+ -+ CmpInst::Predicate cmp = CmpInst::Predicate::FCMP_FALSE; -+ switch (key.alphaTest.func) { -+ case PIPE_FUNC_NEVER: -+ cmp = CmpInst::Predicate::FCMP_FALSE; -+ break; -+ case PIPE_FUNC_LESS: -+ cmp = CmpInst::Predicate::FCMP_OLT; -+ break; -+ case PIPE_FUNC_EQUAL: -+ cmp = 
CmpInst::Predicate::FCMP_OEQ; -+ break; -+ case PIPE_FUNC_LEQUAL: -+ cmp = CmpInst::Predicate::FCMP_OLE; -+ break; -+ case PIPE_FUNC_GREATER: -+ cmp = CmpInst::Predicate::FCMP_OGT; -+ break; -+ case PIPE_FUNC_NOTEQUAL: -+ cmp = CmpInst::Predicate::FCMP_ONE; -+ break; -+ case PIPE_FUNC_GEQUAL: -+ cmp = CmpInst::Predicate::FCMP_OGE; -+ break; -+ case PIPE_FUNC_ALWAYS: -+ cmp = CmpInst::Predicate::FCMP_TRUE; -+ break; -+ } -+ -+ Value *alpha_result = -+ IRB()->CreateFCmp(cmp, alpha, ref, "alphaTestFunc"); -+ -+ mask_result = -+ wrap(AND(unwrap(mask_result), S_EXT(alpha_result, mSimdInt32Ty))); -+ } -+ -+ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { -+ STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_mask}); -+ } -+ -+ RET_VOID(); -+ -+ gallivm_verify_function(gallivm, wrap(pFunction)); -+ -+ gallivm_compile_module(gallivm); -+ -+ PFN_PIXEL_KERNEL kernel = -+ (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); -+ debug_printf("frag shader %p\n", kernel); -+ assert(kernel && "Error: FragShader = NULL"); -+ -+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) -+ JM()->mIsModuleFinalized = true; -+#endif -+ -+ return kernel; -+} -+ -+PFN_PIXEL_KERNEL -+swr_compile_fs(struct swr_context *ctx, swr_jit_key &key) -+{ -+ BuilderSWR builder( -+ reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr)); -+ return builder.CompileFS(ctx, key); -+} -diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h -new file mode 100644 -index 0000000..2962646 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_shader.h -@@ -0,0 +1,61 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ ***************************************************************************/ -+ -+#pragma once -+ -+class swr_vertex_shader; -+class swr_fragment_shader; -+class swr_jit_key; -+ -+PFN_VERTEX_FUNC -+swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs); -+ -+PFN_PIXEL_KERNEL -+swr_compile_fs(struct swr_context *ctx, swr_jit_key &key); -+ -+void swr_generate_fs_key(struct swr_jit_key &key, -+ struct swr_context *ctx, -+ swr_fragment_shader *swr_fs); -+ -+struct swr_jit_key { -+ unsigned nr_cbufs; -+ unsigned light_twoside; -+ ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; -+ ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; -+ unsigned nr_samplers; -+ unsigned nr_sampler_views; -+ struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; -+ struct pipe_alpha_state alphaTest; -+}; -+ -+namespace std -+{ -+template <> struct hash { -+ std::size_t operator()(const swr_jit_key &k) const -+ { -+ return util_hash_crc32(&k, sizeof(k)); -+ } -+}; -+}; -+ -+bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs); -diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp -new file mode 100644 -index 0000000..fa16844 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_state.cpp -@@ -0,0 +1,1344 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#include "common/os.h" -+#include "jit_api.h" -+#include "JitManager.h" -+#include "state_llvm.h" -+ -+#include "gallivm/lp_bld_tgsi.h" -+#include "util/u_format.h" -+ -+#include "util/u_memory.h" -+#include "util/u_inlines.h" -+#include "util/u_helpers.h" -+#include "util/u_framebuffer.h" -+ -+#include "swr_state.h" -+#include "swr_context.h" -+#include "swr_context_llvm.h" -+#include "swr_screen.h" -+#include "swr_resource.h" -+#include "swr_tex_sample.h" -+#include "swr_scratch.h" -+#include "swr_shader.h" -+ -+/* These should be pulled out into separate files as necessary -+ * Just initializing everything here to get going. 
*/ -+ -+static void * -+swr_create_blend_state(struct pipe_context *pipe, -+ const struct pipe_blend_state *blend) -+{ -+ struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state); -+ -+ memcpy(&state->pipe, blend, sizeof(*blend)); -+ -+ struct pipe_blend_state *pipe_blend = &state->pipe; -+ -+ for (int target = 0; -+ target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS); -+ target++) { -+ state->compileState[target].independentAlphaBlendEnable = -+ pipe_blend->independent_blend_enable; -+ -+ struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target]; -+ SWR_RENDER_TARGET_BLEND_STATE &targetState = -+ state->compileState[target].blendState; -+ -+ if (target != 0 && !pipe_blend->independent_blend_enable) { -+ memcpy(&targetState, &state->compileState[0].blendState, sizeof(SWR_RENDER_TARGET_BLEND_STATE)); -+ continue; -+ } -+ -+ targetState.colorBlendEnable = rt_blend->blend_enable; -+ if (targetState.colorBlendEnable) { -+ targetState.sourceAlphaBlendFactor = -+ swr_convert_blend_factor(rt_blend->alpha_src_factor); -+ targetState.destAlphaBlendFactor = -+ swr_convert_blend_factor(rt_blend->alpha_dst_factor); -+ targetState.sourceBlendFactor = -+ swr_convert_blend_factor(rt_blend->rgb_src_factor); -+ targetState.destBlendFactor = -+ swr_convert_blend_factor(rt_blend->rgb_dst_factor); -+ -+ targetState.colorBlendFunc = -+ swr_convert_blend_func(rt_blend->rgb_func); -+ targetState.alphaBlendFunc = -+ swr_convert_blend_func(rt_blend->alpha_func); -+ } -+ -+ targetState.writeDisableRed = -+ (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1; -+ targetState.writeDisableGreen = -+ (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1; -+ targetState.writeDisableBlue = -+ (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1; -+ targetState.writeDisableAlpha = -+ (rt_blend->colormask & PIPE_MASK_A) ? 
0 : 1; -+ } -+ -+ return state; -+} -+ -+static void -+swr_bind_blend_state(struct pipe_context *pipe, void *blend) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->blend == blend) -+ return; -+ -+ ctx->blend = (swr_blend_state *)blend; -+ -+ ctx->dirty |= SWR_NEW_BLEND; -+} -+ -+static void -+swr_delete_blend_state(struct pipe_context *pipe, void *blend) -+{ -+ FREE(blend); -+} -+ -+static void -+swr_set_blend_color(struct pipe_context *pipe, -+ const struct pipe_blend_color *color) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->blend_color = *color; -+ -+ ctx->dirty |= SWR_NEW_BLEND; -+} -+ -+static void -+swr_set_stencil_ref(struct pipe_context *pipe, -+ const struct pipe_stencil_ref *ref) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->stencil_ref = *ref; -+ -+ ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; -+} -+ -+static void * -+swr_create_depth_stencil_state( -+ struct pipe_context *pipe, -+ const struct pipe_depth_stencil_alpha_state *depth_stencil) -+{ -+ struct pipe_depth_stencil_alpha_state *state; -+ -+ state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil, -+ sizeof *depth_stencil); -+ -+ return state; -+} -+ -+static void -+swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil) -+ return; -+ -+ ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil; -+ -+ ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; -+} -+ -+static void -+swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) -+{ -+ FREE(depth); -+} -+ -+ -+static void * -+swr_create_rasterizer_state(struct pipe_context *pipe, -+ const struct pipe_rasterizer_state *rast) -+{ -+ struct pipe_rasterizer_state *state; -+ state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast); -+ -+ return state; -+} -+ -+static void -+swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ const struct pipe_rasterizer_state *rasterizer = -+ (const struct pipe_rasterizer_state *)handle; -+ -+ if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer) -+ return; -+ -+ ctx->rasterizer = (pipe_rasterizer_state *)rasterizer; -+ -+ ctx->dirty |= SWR_NEW_RASTERIZER; -+} -+ -+static void -+swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) -+{ -+ FREE(rasterizer); -+} -+ -+ -+static void * -+swr_create_sampler_state(struct pipe_context *pipe, -+ const struct pipe_sampler_state *sampler) -+{ -+ struct pipe_sampler_state *state = -+ (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler); -+ -+ return state; -+} -+ -+static void -+swr_bind_sampler_states(struct pipe_context *pipe, -+ unsigned shader, -+ unsigned start, -+ unsigned num, -+ void **samplers) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ unsigned i; -+ -+ assert(shader < PIPE_SHADER_TYPES); -+ assert(start + num <= Elements(ctx->samplers[shader])); -+ -+ /* set the new samplers */ -+ ctx->num_samplers[shader] = num; -+ for (i = 0; i < num; i++) { -+ ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i]; -+ } -+ -+ ctx->dirty |= SWR_NEW_SAMPLER; -+} -+ -+static void -+swr_delete_sampler_state(struct pipe_context *pipe, void *sampler) -+{ -+ FREE(sampler); -+} -+ -+ -+static struct pipe_sampler_view * -+swr_create_sampler_view(struct pipe_context *pipe, -+ struct pipe_resource *texture, -+ const struct pipe_sampler_view *templ) -+{ -+ struct 
pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); -+ -+ if (view) { -+ *view = *templ; -+ view->reference.count = 1; -+ view->texture = NULL; -+ pipe_resource_reference(&view->texture, texture); -+ view->context = pipe; -+ } -+ -+ return view; -+} -+ -+static void -+swr_set_sampler_views(struct pipe_context *pipe, -+ unsigned shader, -+ unsigned start, -+ unsigned num, -+ struct pipe_sampler_view **views) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ uint i; -+ -+ assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+ assert(shader < PIPE_SHADER_TYPES); -+ assert(start + num <= Elements(ctx->sampler_views[shader])); -+ -+ /* set the new sampler views */ -+ ctx->num_sampler_views[shader] = num; -+ for (i = 0; i < num; i++) { -+ /* Note: we're using pipe_sampler_view_release() here to work around -+ * a possible crash when the old view belongs to another context that -+ * was already destroyed. -+ */ -+ pipe_sampler_view_release(pipe, &ctx->sampler_views[shader][start + i]); -+ pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], -+ views[i]); -+ } -+ -+ ctx->dirty |= SWR_NEW_SAMPLER_VIEW; -+} -+ -+static void -+swr_sampler_view_destroy(struct pipe_context *pipe, -+ struct pipe_sampler_view *view) -+{ -+ pipe_resource_reference(&view->texture, NULL); -+ FREE(view); -+} -+ -+static void * -+swr_create_vs_state(struct pipe_context *pipe, -+ const struct pipe_shader_state *vs) -+{ -+ struct swr_vertex_shader *swr_vs = -+ (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader); -+ if (!swr_vs) -+ return NULL; -+ -+ swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens); -+ swr_vs->pipe.stream_output = vs->stream_output; -+ -+ lp_build_tgsi_info(vs->tokens, &swr_vs->info); -+ -+ swr_vs->func = swr_compile_vs(pipe, swr_vs); -+ -+ swr_vs->soState = {0}; -+ -+ if (swr_vs->pipe.stream_output.num_outputs) { -+ pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output; -+ -+ swr_vs->soState.soEnable = true; -+ // soState.rasterizerDisable set on state dirty -+ // soState.streamToRasterizer not used -+ -+ for (uint32_t i = 0; i < stream_output->num_outputs; i++) { -+ swr_vs->soState.streamMasks[stream_output->output[i].stream] |= -+ 1 << (stream_output->output[i].register_index - 1); -+ } -+ for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { -+ swr_vs->soState.streamNumEntries[i] = -+ _mm_popcnt_u32(swr_vs->soState.streamMasks[i]); -+ } -+ } -+ -+ return swr_vs; -+} -+ -+static void -+swr_bind_vs_state(struct pipe_context *pipe, void *vs) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->vs == vs) -+ return; -+ -+ ctx->vs = (swr_vertex_shader *)vs; -+ ctx->dirty |= SWR_NEW_VS; -+} -+ -+static void -+swr_delete_vs_state(struct pipe_context *pipe, void *vs) -+{ -+ struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; -+ FREE((void *)swr_vs->pipe.tokens); -+ FREE(vs); -+} -+ -+static void * -+swr_create_fs_state(struct pipe_context *pipe, -+ const struct pipe_shader_state *fs) -+{ -+ struct swr_fragment_shader *swr_fs = new swr_fragment_shader; -+ if (!swr_fs) -+ return NULL; -+ -+ swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens); -+ -+ lp_build_tgsi_info(fs->tokens, &swr_fs->info); -+ -+ return swr_fs; -+} -+ -+ -+static void -+swr_bind_fs_state(struct pipe_context *pipe, void *fs) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ctx->fs == fs) -+ return; -+ -+ ctx->fs = (swr_fragment_shader *)fs; -+ ctx->dirty |= SWR_NEW_FS; -+} -+ -+static void -+swr_delete_fs_state(struct pipe_context *pipe, void *fs) -+{ -+ struct swr_fragment_shader 
*swr_fs = (swr_fragment_shader *)fs; -+ FREE((void *)swr_fs->pipe.tokens); -+ delete swr_fs; -+} -+ -+ -+static void -+swr_set_constant_buffer(struct pipe_context *pipe, -+ uint shader, -+ uint index, -+ struct pipe_constant_buffer *cb) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct pipe_resource *constants = cb ? cb->buffer : NULL; -+ -+ assert(shader < PIPE_SHADER_TYPES); -+ assert(index < Elements(ctx->constants[shader])); -+ -+ /* note: reference counting */ -+ util_copy_constant_buffer(&ctx->constants[shader][index], cb); -+ -+ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { -+ ctx->dirty |= SWR_NEW_VSCONSTANTS; -+ } else if (shader == PIPE_SHADER_FRAGMENT) { -+ ctx->dirty |= SWR_NEW_FSCONSTANTS; -+ } -+ -+ if (cb && cb->user_buffer) { -+ pipe_resource_reference(&constants, NULL); -+ } -+} -+ -+ -+static void * -+swr_create_vertex_elements_state(struct pipe_context *pipe, -+ unsigned num_elements, -+ const struct pipe_vertex_element *attribs) -+{ -+ struct swr_vertex_element_state *velems; -+ assert(num_elements <= PIPE_MAX_ATTRIBS); -+ velems = CALLOC_STRUCT(swr_vertex_element_state); -+ if (velems) { -+ velems->fsState.numAttribs = num_elements; -+ for (unsigned i = 0; i < num_elements; i++) { -+ // XXX: we should do this keyed on the VS usage info -+ -+ const struct util_format_description *desc = -+ util_format_description(attribs[i].src_format); -+ -+ velems->fsState.layout[i].AlignedByteOffset = attribs[i].src_offset; -+ velems->fsState.layout[i].Format = -+ mesa_to_swr_format(attribs[i].src_format); -+ velems->fsState.layout[i].StreamIndex = -+ attribs[i].vertex_buffer_index; -+ velems->fsState.layout[i].InstanceEnable = -+ attribs[i].instance_divisor != 0; -+ velems->fsState.layout[i].ComponentControl0 = -+ desc->channel[0].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store0; -+ velems->fsState.layout[i].ComponentControl1 = -+ desc->channel[1].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store0; -+ velems->fsState.layout[i].ComponentControl2 = -+ desc->channel[2].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store0; -+ velems->fsState.layout[i].ComponentControl3 = -+ desc->channel[3].type != UTIL_FORMAT_TYPE_VOID -+ ? ComponentControl::StoreSrc -+ : ComponentControl::Store1Fp; -+ velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; -+ velems->fsState.layout[i].InstanceDataStepRate = -+ attribs[i].instance_divisor; -+ -+ /* Calculate the pitch of each stream */ -+ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo( -+ mesa_to_swr_format(attribs[i].src_format)); -+ velems->stream_pitch[attribs[i].vertex_buffer_index] += swr_desc.Bpp; -+ } -+ } -+ -+ return velems; -+} -+ -+static void -+swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ struct swr_vertex_element_state *swr_velems = -+ (struct swr_vertex_element_state *)velems; -+ -+ ctx->velems = swr_velems; -+ ctx->dirty |= SWR_NEW_VERTEX; -+} -+ -+static void -+swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems) -+{ -+ /* XXX Need to destroy fetch shader? 
*/ -+ FREE(velems); -+} -+ -+ -+static void -+swr_set_vertex_buffers(struct pipe_context *pipe, -+ unsigned start_slot, -+ unsigned num_elements, -+ const struct pipe_vertex_buffer *buffers) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ assert(num_elements <= PIPE_MAX_ATTRIBS); -+ -+ util_set_vertex_buffers_count(ctx->vertex_buffer, -+ &ctx->num_vertex_buffers, -+ buffers, -+ start_slot, -+ num_elements); -+ -+ ctx->dirty |= SWR_NEW_VERTEX; -+} -+ -+ -+static void -+swr_set_index_buffer(struct pipe_context *pipe, -+ const struct pipe_index_buffer *ib) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (ib) -+ memcpy(&ctx->index_buffer, ib, sizeof(ctx->index_buffer)); -+ else -+ memset(&ctx->index_buffer, 0, sizeof(ctx->index_buffer)); -+ -+ ctx->dirty |= SWR_NEW_VERTEX; -+} -+ -+static void -+swr_set_polygon_stipple(struct pipe_context *pipe, -+ const struct pipe_poly_stipple *stipple) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->poly_stipple = *stipple; /* struct copy */ -+ ctx->dirty |= SWR_NEW_STIPPLE; -+} -+ -+static void -+swr_set_clip_state(struct pipe_context *pipe, -+ const struct pipe_clip_state *clip) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->clip = *clip; -+ /* XXX Unimplemented, but prevents crash */ -+ -+ ctx->dirty |= SWR_NEW_CLIP; -+} -+ -+ -+static void -+swr_set_scissor_states(struct pipe_context *pipe, -+ unsigned start_slot, -+ unsigned num_viewports, -+ const struct pipe_scissor_state *scissor) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->scissor = *scissor; -+ ctx->dirty |= SWR_NEW_SCISSOR; -+} -+ -+static void -+swr_set_viewport_states(struct pipe_context *pipe, -+ unsigned start_slot, -+ unsigned num_viewports, -+ const struct pipe_viewport_state *vpt) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ ctx->viewport = *vpt; -+ ctx->dirty |= SWR_NEW_VIEWPORT; -+} -+ -+ -+static void -+swr_set_framebuffer_state(struct pipe_context *pipe, -+ const struct pipe_framebuffer_state *fb) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ boolean changed = !util_framebuffer_state_equal(&ctx->framebuffer, fb); -+ -+ assert(fb->width <= KNOB_GUARDBAND_WIDTH); -+ assert(fb->height <= KNOB_GUARDBAND_HEIGHT); -+ -+ if (changed) { -+ unsigned i; -+ for (i = 0; i < fb->nr_cbufs; ++i) -+ pipe_surface_reference(&ctx->framebuffer.cbufs[i], fb->cbufs[i]); -+ for (; i < ctx->framebuffer.nr_cbufs; ++i) -+ pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL); -+ -+ ctx->framebuffer.nr_cbufs = fb->nr_cbufs; -+ -+ ctx->framebuffer.width = fb->width; -+ ctx->framebuffer.height = fb->height; -+ -+ pipe_surface_reference(&ctx->framebuffer.zsbuf, fb->zsbuf); -+ -+ ctx->dirty |= SWR_NEW_FRAMEBUFFER; -+ } -+} -+ -+ -+static void -+swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) -+{ -+ struct swr_context *ctx = swr_context(pipe); -+ -+ if (sample_mask != ctx->sample_mask) { -+ ctx->sample_mask = sample_mask; -+ ctx->dirty |= SWR_NEW_RASTERIZER; -+ } -+} -+ -+ -+void -+swr_update_derived(struct swr_context *ctx, -+ const struct pipe_draw_info *p_draw_info) -+{ -+ /* Any state that requires dirty flags to be re-triggered sets this mask */ -+ /* For example, user_buffer vertex and index buffers. 
*/ -+ unsigned post_update_dirty_flags = 0; -+ -+ /* Render Targets */ -+ if (ctx->dirty & SWR_NEW_FRAMEBUFFER) { -+ struct pipe_framebuffer_state *fb = &ctx->framebuffer; -+ SWR_SURFACE_STATE *new_attachment[SWR_NUM_ATTACHMENTS] = {0}; -+ boolean changed, need_idle; -+ UINT i; -+ -+ /* colorbuffer targets */ -+ if (fb->nr_cbufs) -+ for (i = 0; i < fb->nr_cbufs; ++i) -+ if (fb->cbufs[i]) { -+ struct swr_resource *colorBuffer = -+ swr_resource(fb->cbufs[i]->texture); -+ new_attachment[SWR_ATTACHMENT_COLOR0 + i] = &colorBuffer->swr; -+ } -+ -+ /* depth/stencil target */ -+ if (fb->zsbuf) { -+ struct swr_resource *depthStencilBuffer = -+ swr_resource(fb->zsbuf->texture); -+ if (depthStencilBuffer->has_depth) { -+ new_attachment[SWR_ATTACHMENT_DEPTH] = &depthStencilBuffer->swr; -+ -+ if (depthStencilBuffer->has_stencil) -+ new_attachment[SWR_ATTACHMENT_STENCIL] = -+ &depthStencilBuffer->secondary; -+ -+ } else if (depthStencilBuffer->has_stencil) -+ new_attachment[SWR_ATTACHMENT_STENCIL] = &depthStencilBuffer->swr; -+ } -+ -+ /* For each attachment that has changed, store tile contents to render -+ * target */ -+ changed = FALSE; -+ need_idle = FALSE; -+ for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { -+ if ((uintptr_t)ctx->current.attachment[i] -+ ^ (uintptr_t)new_attachment[i]) { -+ if (ctx->current.attachment[i]) { -+ enum SWR_TILE_STATE post_state; -+ post_state = -+ (new_attachment[i] ? SWR_TILE_INVALID : SWR_TILE_RESOLVED); -+ swr_store_render_target(ctx, i, post_state); -+ need_idle |= TRUE; -+ } -+ changed |= TRUE; -+ } -+ } -+ -+ /* -+ * Attachments are live, don't update any until idle -+ * (all StoreTiles, called by swr_store_render_targets, finish) -+ */ -+ if (need_idle) -+ SwrWaitForIdle(ctx->swrContext); -+ -+ if (changed) { -+ /* Update actual SWR core attachments, or clear those no longer -+ * attached */ -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; -+ for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { -+ if ((uintptr_t)ctx->current.attachment[i] -+ ^ (uintptr_t)new_attachment[i]) { -+ if (new_attachment[i]) { -+ renderTargets[i] = *new_attachment[i]; -+ ctx->current.attachment[i] = new_attachment[i]; -+ } else { -+ renderTargets[i] = {0}; -+ ctx->current.attachment[i] = nullptr; -+ } -+ } -+ } -+ -+ /* rendertarget changes also necessitate updating other state */ -+ ctx->dirty |= SWR_NEW_BLEND | SWR_NEW_SAMPLER_VIEW | SWR_NEW_VS -+ | SWR_NEW_FS | SWR_NEW_RASTERIZER | SWR_NEW_VIEWPORT -+ | SWR_NEW_DEPTH_STENCIL_ALPHA; -+ } -+ } -+ -+ /* Raster state */ -+ if (ctx->dirty & (SWR_NEW_RASTERIZER | SWR_NEW_VS)) { -+ SWR_RASTSTATE *rastState = &ctx->current.rastState; -+ rastState->cullMode = swr_convert_cull_mode(ctx->rasterizer->cull_face); -+ rastState->frontWinding = ctx->rasterizer->front_ccw -+ ? SWR_FRONTWINDING_CCW -+ : SWR_FRONTWINDING_CW; -+ rastState->scissorEnable = ctx->rasterizer->scissor; -+ rastState->pointSize = ctx->rasterizer->point_size > 0.0f -+ ? ctx->rasterizer->point_size -+ : 1.0f; -+ rastState->lineWidth = ctx->rasterizer->line_width > 0.0f -+ ? 
ctx->rasterizer->line_width -+ : 1.0f; -+ -+ rastState->pointParam = ctx->rasterizer->point_size_per_vertex; -+ rastState->pointSizeAttrib = ctx->vs->pointSizeAttrib; -+ -+ rastState->pointSpriteEnable = ctx->rasterizer->sprite_coord_enable; -+ rastState->pointSpriteTopOrigin = -+ ctx->rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; -+ rastState->pointSpriteFESlot = ctx->vs->info.base.num_outputs; -+ -+ /* XXX TODO: Add multisample */ -+ rastState->sampleCount = SWR_MULTISAMPLE_1X; -+ -+ bool do_offset = false; -+ switch (ctx->rasterizer->fill_front) { -+ case PIPE_POLYGON_MODE_FILL: -+ do_offset = ctx->rasterizer->offset_tri; -+ break; -+ case PIPE_POLYGON_MODE_LINE: -+ do_offset = ctx->rasterizer->offset_line; -+ break; -+ case PIPE_POLYGON_MODE_POINT: -+ do_offset = ctx->rasterizer->offset_point; -+ break; -+ } -+ -+ if (do_offset) { -+ rastState->depthBias = ctx->rasterizer->offset_units; -+ rastState->slopeScaledDepthBias = ctx->rasterizer->offset_scale; -+ rastState->depthBiasClamp = ctx->rasterizer->offset_clamp; -+ } else { -+ rastState->depthBias = 0; -+ rastState->slopeScaledDepthBias = 0; -+ rastState->depthBiasClamp = 0; -+ } -+ struct pipe_surface *zb = ctx->framebuffer.zsbuf; -+ if (zb && swr_resource(zb->texture)->has_depth) -+ rastState->depthFormat = swr_resource(zb->texture)->swr.format; -+ -+ rastState->depthClipEnable = ctx->rasterizer->depth_clip; -+ -+ SwrSetRastState(ctx->swrContext, rastState); -+ } -+ -+ /* Scissor */ -+ if (ctx->dirty & SWR_NEW_SCISSOR) { -+ BBOX bbox(ctx->scissor.miny, ctx->scissor.maxy, -+ ctx->scissor.minx, ctx->scissor.maxx); -+ SwrSetScissorRects(ctx->swrContext, 1, &bbox); -+ } -+ -+ /* Viewport */ -+ if (ctx->dirty & SWR_NEW_VIEWPORT) { -+ pipe_viewport_state *state = &ctx->viewport; -+ SWR_VIEWPORT *vp = &ctx->current.vp; -+ SWR_VIEWPORT_MATRIX *vpm = &ctx->current.vpm; -+ -+ const float scale_x = fabs(state->scale[0]); -+ const float scale_y = fabs(state->scale[1]); -+ const float scale_z = fabs(state->scale[2]); -+ -+ vp->x = state->translate[0] - scale_x; -+ vp->width = state->translate[0] + scale_x; -+ vp->y = state->translate[1] - scale_y; -+ vp->height = state->translate[1] + scale_y; -+ if (ctx->rasterizer->clip_halfz == 0) { -+ vp->minZ = state->translate[2] - scale_z; -+ vp->maxZ = state->translate[2] + scale_z; -+ } else { -+ vp->minZ = state->translate[2]; -+ vp->maxZ = state->translate[2] + scale_z; -+ } -+ -+ /* Flip viewport for all targets except samplable textures. */ -+ /* XXX This may not be sufficient for multiple rendertargets */ -+ struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; -+ if (cb && -+ !(swr_resource(cb->texture)->base.bind & PIPE_BIND_SAMPLER_VIEW)) { -+ /* Flip y and y-translate in the viewport matrix. */ -+ vpm->m00 = (vp->width - vp->x) / 2.0f; -+ vpm->m11 = (vp->y - vp->height) / 2.0f; -+ vpm->m22 = (vp->maxZ - vp->minZ) / 2.0f; -+ vpm->m30 = vp->x + vpm->m00; -+ vpm->m31 = vp->height + vpm->m11; -+ vpm->m32 = vp->minZ + vpm->m22; -+ } else { -+ vpm->m00 = (vp->width - vp->x) / 2.0f; -+ vpm->m11 = (vp->height - vp->y) / 2.0f; -+ vpm->m22 = (vp->maxZ - vp->minZ) / 2.0f; -+ vpm->m30 = vp->x + vpm->m00; -+ vpm->m31 = vp->y + vpm->m11; -+ vpm->m32 = vp->minZ + vpm->m22; -+ } -+ -+ /* Now that the matrix is calculated, clip the view coords to screen -+ * size. OpenGL allows for -ve x,y in the viewport. 
-+ */ -+ vp->x = std::max(vp->x, 0.0f); -+ vp->y = std::max(vp->y, 0.0f); -+ vp->width = std::min(vp->width, (float)ctx->framebuffer.width); -+ vp->height = std::min(vp->height, (float)ctx->framebuffer.height); -+ -+ SwrSetViewports(ctx->swrContext, 1, vp, vpm); -+ } -+ -+ /* Set vertex & index buffers */ -+ /* (using draw info if called by swr_draw_vbo) */ -+ if (ctx->dirty & SWR_NEW_VERTEX) { -+ uint32_t size, pitch, max_vertex, partial_inbounds; -+ const uint8_t *p_data; -+ -+ /* If being called by swr_draw_vbo, copy draw details */ -+ struct pipe_draw_info info = {0}; -+ if (p_draw_info) -+ info = *p_draw_info; -+ -+ /* vertex buffers */ -+ SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS]; -+ for (UINT i = 0; i < ctx->num_vertex_buffers; i++) { -+ pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; -+ -+ pitch = vb->stride; -+ if (!vb->user_buffer) { -+ /* VBO -+ * size is based on buffer->width0 rather than info.max_index -+ * to prevent having to validate VBO on each draw */ -+ size = vb->buffer->width0; -+ max_vertex = size / pitch; -+ partial_inbounds = size % pitch; -+ -+ p_data = (const uint8_t *)swr_resource_data(vb->buffer) -+ + vb->buffer_offset; -+ } else { -+ /* Client buffer -+ * client memory is one-time use, re-trigger SWR_NEW_VERTEX to -+ * revalidate on each draw */ -+ post_update_dirty_flags |= SWR_NEW_VERTEX; -+ -+ if (pitch) { -+ size = (info.max_index - info.min_index + 1) * pitch; -+ } else { -+ /* pitch = 0, means constant value -+ * set size to 1 vertex */ -+ size = ctx->velems->stream_pitch[i]; -+ } -+ -+ max_vertex = info.max_index + 1; -+ partial_inbounds = 0; -+ -+ /* Copy only needed vertices to scratch space */ -+ size = AlignUp(size, 4); -+ const void *ptr = (const uint8_t *) vb->user_buffer -+ + info.min_index * pitch; -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->vertex_buffer, ptr, size); -+ p_data = (const uint8_t *)ptr - info.min_index * pitch; -+ } -+ -+ swrVertexBuffers[i] = {0}; -+ swrVertexBuffers[i].index = i; -+ swrVertexBuffers[i].pitch = pitch; -+ swrVertexBuffers[i].pData = p_data; -+ swrVertexBuffers[i].size = size; -+ swrVertexBuffers[i].maxVertex = max_vertex; -+ swrVertexBuffers[i].partialInboundsSize = partial_inbounds; -+ } -+ -+ SwrSetVertexBuffers( -+ ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers); -+ -+ /* index buffer, if required (info passed in by swr_draw_vbo) */ -+ SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */ -+ if (info.indexed) { -+ pipe_index_buffer *ib = &ctx->index_buffer; -+ -+ pitch = ib->index_size ? 
ib->index_size : sizeof(uint32_t); -+ index_type = swr_convert_index_type(pitch); -+ -+ if (!ib->user_buffer) { -+ /* VBO -+ * size is based on buffer->width0 rather than info.count -+ * to prevent having to validate VBO on each draw */ -+ size = ib->buffer->width0; -+ p_data = -+ (const uint8_t *)swr_resource_data(ib->buffer) + ib->offset; -+ } else { -+ /* Client buffer -+ * client memory is one-time use, re-trigger SWR_NEW_VERTEX to -+ * revalidate on each draw */ -+ post_update_dirty_flags |= SWR_NEW_VERTEX; -+ -+ size = info.count * pitch; -+ size = AlignUp(size, 4); -+ -+ /* Copy indices to scratch space */ -+ const void *ptr = ib->user_buffer; -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->index_buffer, ptr, size); -+ p_data = (const uint8_t *)ptr; -+ } -+ -+ SWR_INDEX_BUFFER_STATE swrIndexBuffer; -+ swrIndexBuffer.format = swr_convert_index_type(ib->index_size); -+ swrIndexBuffer.pIndices = p_data; -+ swrIndexBuffer.size = size; -+ -+ SwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer); -+ } -+ -+ struct swr_vertex_element_state *velems = ctx->velems; -+ if (velems && velems->fsState.indexType != index_type) { -+ velems->fsFunc = NULL; -+ velems->fsState.indexType = index_type; -+ } -+ } -+ -+ /* VertexShader */ -+ if (ctx->dirty & SWR_NEW_VS) { -+ SwrSetVertexFunc(ctx->swrContext, ctx->vs->func); -+ } -+ -+ swr_jit_key key; -+ if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW -+ | SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_RASTERIZER -+ | SWR_NEW_FRAMEBUFFER)) { -+ memset(&key, 0, sizeof(key)); -+ swr_generate_fs_key(key, ctx, ctx->fs); -+ auto search = ctx->fs->map.find(key); -+ PFN_PIXEL_KERNEL func; -+ if (search != ctx->fs->map.end()) { -+ func = search->second; -+ } else { -+ func = swr_compile_fs(ctx, key); -+ ctx->fs->map.insert(std::make_pair(key, func)); -+ } -+ SWR_PS_STATE psState = {0}; -+ psState.pfnPixelShader = func; -+ psState.killsPixel = -+ ctx->fs->info.base.uses_kill || key.alphaTest.enabled; -+ psState.writesODepth = ctx->fs->info.base.writes_z; -+ psState.usesSourceDepth = ctx->fs->info.base.reads_z; -+ psState.maxRTSlotUsed = -+ (ctx->framebuffer.nr_cbufs != 0) ? 
-+ (ctx->framebuffer.nr_cbufs - 1) : -+ 0; -+ SwrSetPixelShaderState(ctx->swrContext, &psState); -+ } -+ -+ /* JIT sampler state */ -+ if (ctx->dirty & SWR_NEW_SAMPLER) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (unsigned i = 0; i < key.nr_samplers; i++) { -+ const struct pipe_sampler_state *sampler = -+ ctx->samplers[PIPE_SHADER_FRAGMENT][i]; -+ -+ if (sampler) { -+ pDC->samplersFS[i].min_lod = sampler->min_lod; -+ pDC->samplersFS[i].max_lod = sampler->max_lod; -+ pDC->samplersFS[i].lod_bias = sampler->lod_bias; -+ COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f); -+ } -+ } -+ } -+ -+ /* JIT sampler view state */ -+ if (ctx->dirty & SWR_NEW_SAMPLER_VIEW) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (unsigned i = 0; i < key.nr_sampler_views; i++) { -+ struct pipe_sampler_view *view = -+ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; -+ -+ if (view) { -+ struct pipe_resource *res = view->texture; -+ struct swr_resource *swr_res = swr_resource(res); -+ struct swr_jit_texture *jit_tex = &pDC->texturesFS[i]; -+ memset(jit_tex, 0, sizeof(*jit_tex)); -+ jit_tex->width = res->width0; -+ jit_tex->height = res->height0; -+ jit_tex->depth = res->depth0; -+ jit_tex->first_level = view->u.tex.first_level; -+ jit_tex->last_level = view->u.tex.last_level; -+ jit_tex->base_ptr = swr_res->swr.pBaseAddress; -+ -+ for (unsigned level = jit_tex->first_level; -+ level <= jit_tex->last_level; -+ level++) { -+ jit_tex->row_stride[level] = swr_res->row_stride[level]; -+ jit_tex->img_stride[level] = swr_res->img_stride[level]; -+ jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; -+ } -+ } -+ } -+ } -+ -+ /* VertexShader Constants */ -+ if (ctx->dirty & SWR_NEW_VSCONSTANTS) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { -+ const pipe_constant_buffer *cb = -+ &ctx->constants[PIPE_SHADER_VERTEX][i]; -+ pDC->num_constantsVS[i] = cb->buffer_size; -+ if (cb->buffer) -+ pDC->constantVS[i] = -+ (const float *)((const BYTE *)cb->buffer + cb->buffer_offset); -+ else { -+ /* Need to copy these constants to scratch space */ -+ if (cb->user_buffer && cb->buffer_size) { -+ const void *ptr = -+ ((const BYTE *)cb->user_buffer + cb->buffer_offset); -+ uint32_t size = AlignUp(cb->buffer_size, 4); -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->vs_constants, ptr, size); -+ pDC->constantVS[i] = (const float *)ptr; -+ } -+ } -+ } -+ } -+ -+ /* FragmentShader Constants */ -+ if (ctx->dirty & SWR_NEW_FSCONSTANTS) { -+ swr_draw_context *pDC = -+ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); -+ -+ for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { -+ const pipe_constant_buffer *cb = -+ &ctx->constants[PIPE_SHADER_FRAGMENT][i]; -+ pDC->num_constantsFS[i] = cb->buffer_size; -+ if (cb->buffer) -+ pDC->constantFS[i] = -+ (const float *)((const BYTE *)cb->buffer + cb->buffer_offset); -+ else { -+ /* Need to copy these constants to scratch space */ -+ if (cb->user_buffer && cb->buffer_size) { -+ const void *ptr = -+ ((const BYTE *)cb->user_buffer + cb->buffer_offset); -+ uint32_t size = AlignUp(cb->buffer_size, 4); -+ ptr = swr_copy_to_scratch_space( -+ ctx, &ctx->scratch->fs_constants, ptr, size); -+ pDC->constantFS[i] = (const float *)ptr; -+ } -+ } -+ } -+ } -+ -+ /* Depth/stencil state */ -+ if (ctx->dirty & SWR_NEW_DEPTH_STENCIL_ALPHA) { -+ struct 
pipe_depth_state *depth = &(ctx->depth_stencil->depth); -+ struct pipe_stencil_state *stencil = ctx->depth_stencil->stencil; -+ SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}}; -+ -+ /* XXX, incomplete. Need to flesh out stencil & alpha test state -+ struct pipe_stencil_state *front_stencil = -+ ctx->depth_stencil.stencil[0]; -+ struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1]; -+ struct pipe_alpha_state alpha; -+ */ -+ if (stencil[0].enabled) { -+ depthStencilState.stencilWriteEnable = 1; -+ depthStencilState.stencilTestEnable = 1; -+ depthStencilState.stencilTestFunc = -+ swr_convert_depth_func(stencil[0].func); -+ -+ depthStencilState.stencilPassDepthPassOp = -+ swr_convert_stencil_op(stencil[0].zpass_op); -+ depthStencilState.stencilPassDepthFailOp = -+ swr_convert_stencil_op(stencil[0].zfail_op); -+ depthStencilState.stencilFailOp = -+ swr_convert_stencil_op(stencil[0].fail_op); -+ depthStencilState.stencilWriteMask = stencil[0].writemask; -+ depthStencilState.stencilTestMask = stencil[0].valuemask; -+ depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0]; -+ } -+ if (stencil[1].enabled) { -+ depthStencilState.doubleSidedStencilTestEnable = 1; -+ -+ depthStencilState.backfaceStencilTestFunc = -+ swr_convert_depth_func(stencil[1].func); -+ -+ depthStencilState.backfaceStencilPassDepthPassOp = -+ swr_convert_stencil_op(stencil[1].zpass_op); -+ depthStencilState.backfaceStencilPassDepthFailOp = -+ swr_convert_stencil_op(stencil[1].zfail_op); -+ depthStencilState.backfaceStencilFailOp = -+ swr_convert_stencil_op(stencil[1].fail_op); -+ depthStencilState.backfaceStencilWriteMask = stencil[1].writemask; -+ depthStencilState.backfaceStencilTestMask = stencil[1].valuemask; -+ -+ depthStencilState.backfaceStencilRefValue = -+ ctx->stencil_ref.ref_value[1]; -+ } -+ -+ depthStencilState.depthTestEnable = depth->enabled; -+ depthStencilState.depthTestFunc = swr_convert_depth_func(depth->func); -+ depthStencilState.depthWriteEnable = depth->writemask; -+ SwrSetDepthStencilState(ctx->swrContext, &depthStencilState); -+ } -+ -+ /* Blend State */ -+ if (ctx->dirty & (SWR_NEW_BLEND | SWR_NEW_FRAMEBUFFER)) { -+ struct pipe_framebuffer_state *fb = &ctx->framebuffer; -+ -+ SWR_BLEND_STATE blendState; -+ memset(&blendState, 0, sizeof(blendState)); -+ blendState.independentAlphaBlendEnable = -+ ctx->blend->pipe.independent_blend_enable; -+ blendState.constantColor[0] = ctx->blend_color.color[0]; -+ blendState.constantColor[1] = ctx->blend_color.color[1]; -+ blendState.constantColor[2] = ctx->blend_color.color[2]; -+ blendState.constantColor[3] = ctx->blend_color.color[3]; -+ -+ /* If there are no color buffers bound, disable writes on RT0 -+ * and skip loop */ -+ if (fb->nr_cbufs == 0) { -+ blendState.renderTarget[0].writeDisableRed = 1; -+ blendState.renderTarget[0].writeDisableGreen = 1; -+ blendState.renderTarget[0].writeDisableBlue = 1; -+ blendState.renderTarget[0].writeDisableAlpha = 1; -+ } -+ else -+ for (int target = 0; -+ target < std::min(SWR_NUM_RENDERTARGETS, -+ PIPE_MAX_COLOR_BUFS); -+ target++) { -+ if (!fb->cbufs[target]) -+ continue; -+ -+ BLEND_COMPILE_STATE *compileState = -+ &ctx->blend->compileState[target]; -+ -+ struct swr_resource *colorBuffer = -+ swr_resource(fb->cbufs[target]->texture); -+ compileState->format = colorBuffer->swr.format; -+ -+ memcpy(&blendState.renderTarget[target], -+ &compileState->blendState, -+ sizeof(compileState->blendState)); -+ -+ PFN_BLEND_JIT_FUNC func = NULL; -+ auto search = ctx->blendJIT->find(*compileState); -+ 
if (search != ctx->blendJIT->end()) { -+ func = search->second; -+ } else { -+ HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; -+ func = JitCompileBlend(hJitMgr, *compileState); -+ debug_printf("BLEND shader %p\n", func); -+ assert(func && "Error: BlendShader = NULL"); -+ -+ ctx->blendJIT->insert(std::make_pair(*compileState, func)); -+ } -+ SwrSetBlendFunc(ctx->swrContext, target, func); -+ } -+ -+ SwrSetBlendState(ctx->swrContext, &blendState); -+ } -+ -+ if (ctx->dirty & SWR_NEW_STIPPLE) { -+ /* XXX What to do with this one??? SWR doesn't stipple */ -+ } -+ -+ if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { -+ ctx->vs->soState.rasterizerDisable = -+ ctx->rasterizer->rasterizer_discard; -+ SwrSetSoState(ctx->swrContext, &ctx->vs->soState); -+ -+ pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output; -+ -+ for (uint32_t i = 0; i < ctx->num_so_targets; i++) { -+ SWR_STREAMOUT_BUFFER buffer = {0}; -+ if (!ctx->so_targets[i]) -+ continue; -+ buffer.enable = true; -+ buffer.pBuffer = -+ (uint32_t *)swr_resource_data(ctx->so_targets[i]->buffer); -+ buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; -+ buffer.pitch = stream_output->stride[i]; -+ buffer.streamOffset = ctx->so_targets[i]->buffer_offset >> 2; -+ -+ SwrSetSoBuffers(ctx->swrContext, &buffer, i); -+ } -+ } -+ -+ uint32_t linkage = ctx->vs->linkageMask; -+ if (ctx->rasterizer->sprite_coord_enable) -+ linkage |= (1 << ctx->vs->info.base.num_outputs); -+ -+ SwrSetLinkage(ctx->swrContext, linkage, NULL); -+ -+ // set up frontend state -+ SWR_FRONTEND_STATE feState = {0}; -+ SwrSetFrontendState(ctx->swrContext, &feState); -+ -+ // set up backend state -+ SWR_BACKEND_STATE backendState = {0}; -+ backendState.numAttributes = 1; -+ backendState.numComponents[0] = 4; -+ backendState.constantInterpolationMask = ctx->fs->constantMask; -+ SwrSetBackendState(ctx->swrContext, &backendState); -+ -+ ctx->dirty = post_update_dirty_flags; -+} -+ -+static struct pipe_stream_output_target * -+swr_create_so_target(struct pipe_context *pipe, -+ struct pipe_resource *buffer, -+ unsigned buffer_offset, -+ unsigned buffer_size) -+{ -+ struct pipe_stream_output_target *target; -+ -+ target = CALLOC_STRUCT(pipe_stream_output_target); -+ if (!target) -+ return NULL; -+ -+ target->context = pipe; -+ target->reference.count = 1; -+ pipe_resource_reference(&target->buffer, buffer); -+ target->buffer_offset = buffer_offset; -+ target->buffer_size = buffer_size; -+ return target; -+} -+ -+static void -+swr_destroy_so_target(struct pipe_context *pipe, -+ struct pipe_stream_output_target *target) -+{ -+ pipe_resource_reference(&target->buffer, NULL); -+ FREE(target); -+} -+ -+static void -+swr_set_so_targets(struct pipe_context *pipe, -+ unsigned num_targets, -+ struct pipe_stream_output_target **targets, -+ const unsigned *offsets) -+{ -+ struct swr_context *swr = swr_context(pipe); -+ uint32_t i; -+ -+ assert(num_targets < MAX_SO_STREAMS); -+ -+ for (i = 0; i < num_targets; i++) { -+ pipe_so_target_reference( -+ (struct pipe_stream_output_target **)&swr->so_targets[i], -+ targets[i]); -+ } -+ -+ for (/* fall-through */; i < swr->num_so_targets; i++) { -+ pipe_so_target_reference( -+ (struct pipe_stream_output_target **)&swr->so_targets[i], NULL); -+ } -+ -+ swr->num_so_targets = num_targets; -+ -+ swr->dirty = SWR_NEW_SO; -+} -+ -+ -+void -+swr_state_init(struct pipe_context *pipe) -+{ -+ pipe->create_blend_state = swr_create_blend_state; -+ pipe->bind_blend_state = swr_bind_blend_state; -+ pipe->delete_blend_state = 
swr_delete_blend_state; -+ -+ pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state; -+ pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state; -+ pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state; -+ -+ pipe->create_rasterizer_state = swr_create_rasterizer_state; -+ pipe->bind_rasterizer_state = swr_bind_rasterizer_state; -+ pipe->delete_rasterizer_state = swr_delete_rasterizer_state; -+ -+ pipe->create_sampler_state = swr_create_sampler_state; -+ pipe->bind_sampler_states = swr_bind_sampler_states; -+ pipe->delete_sampler_state = swr_delete_sampler_state; -+ -+ pipe->create_sampler_view = swr_create_sampler_view; -+ pipe->set_sampler_views = swr_set_sampler_views; -+ pipe->sampler_view_destroy = swr_sampler_view_destroy; -+ -+ pipe->create_vs_state = swr_create_vs_state; -+ pipe->bind_vs_state = swr_bind_vs_state; -+ pipe->delete_vs_state = swr_delete_vs_state; -+ -+ pipe->create_fs_state = swr_create_fs_state; -+ pipe->bind_fs_state = swr_bind_fs_state; -+ pipe->delete_fs_state = swr_delete_fs_state; -+ -+ pipe->set_constant_buffer = swr_set_constant_buffer; -+ -+ pipe->create_vertex_elements_state = swr_create_vertex_elements_state; -+ pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state; -+ pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state; -+ -+ pipe->set_vertex_buffers = swr_set_vertex_buffers; -+ pipe->set_index_buffer = swr_set_index_buffer; -+ -+ pipe->set_polygon_stipple = swr_set_polygon_stipple; -+ pipe->set_clip_state = swr_set_clip_state; -+ pipe->set_scissor_states = swr_set_scissor_states; -+ pipe->set_viewport_states = swr_set_viewport_states; -+ -+ pipe->set_framebuffer_state = swr_set_framebuffer_state; -+ -+ pipe->set_blend_color = swr_set_blend_color; -+ pipe->set_stencil_ref = swr_set_stencil_ref; -+ -+ pipe->set_sample_mask = swr_set_sample_mask; -+ -+ pipe->create_stream_output_target = swr_create_so_target; -+ pipe->stream_output_target_destroy = swr_destroy_so_target; -+ pipe->set_stream_output_targets = swr_set_so_targets; -+} -diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h -new file mode 100644 -index 0000000..fdacd42 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_state.h -@@ -0,0 +1,240 @@ -+/**************************************************************************** -+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
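The update path above is dirty-flag driven: the bind/set callbacks wired up by swr_state_init() only stash the new state object and set a bit in ctx->dirty, and swr_update_derived() re-translates just the groups whose bits are set before each draw, re-arming bits such as SWR_NEW_VERTEX when client-memory buffers must be revalidated every draw. It also caches JIT-compiled fragment and blend kernels in hash maps keyed by their compile state, compiling only on a miss. A minimal, self-contained sketch of both patterns follows; all names here are hypothetical stand-ins, not the driver's own types.

    #include <cstddef>
    #include <cstdint>
    #include <unordered_map>

    enum : uint32_t { NEW_BLEND = 1u << 0, NEW_VERTEX = 1u << 1 };

    struct Key {
        uint32_t color_format;
        bool operator==(const Key &o) const { return color_format == o.color_format; }
    };
    struct KeyHash {
        std::size_t operator()(const Key &k) const { return k.color_format * 0x9e3779b9u; }
    };
    using Kernel = void (*)();

    static std::unordered_map<Key, Kernel, KeyHash> cache;

    // Stand-in for a real JIT compile such as JitCompileBlend().
    static Kernel compile_blend(const Key &k) { (void)k; return nullptr; }

    static Kernel get_blend_kernel(const Key &key)
    {
        auto search = cache.find(key);        // reuse an already-compiled kernel
        if (search != cache.end())
            return search->second;
        Kernel func = compile_blend(key);     // compile once on a cache miss
        cache.insert(std::make_pair(key, func));
        return func;
    }

    struct Context { uint32_t dirty = 0; Key blend_key = {}; };

    static void update_derived(Context &ctx)
    {
        uint32_t rearm = 0;
        if (ctx.dirty & NEW_BLEND)
            (void)get_blend_kernel(ctx.blend_key);  // only re-translate what changed
        if (ctx.dirty & NEW_VERTEX)
            rearm |= NEW_VERTEX;   // user-memory buffers force revalidation next draw
        ctx.dirty = rearm;         // mirrors "ctx->dirty = post_update_dirty_flags"
    }

    int main()
    {
        Context ctx;
        ctx.dirty = NEW_BLEND | NEW_VERTEX;
        update_derived(ctx);       // leaves only NEW_VERTEX set for the next draw
        return ctx.dirty == NEW_VERTEX ? 0 : 1;
    }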
IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. -+ ***************************************************************************/ -+ -+#ifndef SWR_STATE_H -+#define SWR_STATE_H -+ -+#include "pipe/p_defines.h" -+#include "tgsi/tgsi_scan.h" -+#include "tgsi/tgsi_parse.h" -+#include "tgsi/tgsi_dump.h" -+#include "gallivm/lp_bld_tgsi.h" -+#include "util/u_hash.h" -+#include "api.h" -+#include "swr_tex_sample.h" -+#include "swr_shader.h" -+#include -+ -+/* skeleton */ -+struct swr_vertex_shader { -+ struct pipe_shader_state pipe; -+ struct lp_tgsi_info info; -+ unsigned linkageMask; -+ unsigned pointSizeAttrib; -+ PFN_VERTEX_FUNC func; -+ SWR_STREAMOUT_STATE soState; -+ PFN_SO_FUNC soFunc[PIPE_PRIM_MAX]; -+}; -+ -+struct swr_fragment_shader { -+ struct pipe_shader_state pipe; -+ struct lp_tgsi_info info; -+ unsigned constantMask; -+ std::unordered_map map; -+}; -+ -+/* Vertex element state */ -+struct swr_vertex_element_state { -+ FETCH_COMPILE_STATE fsState; -+ PFN_FETCH_FUNC fsFunc; -+#if 1 //BMCDEBUG -+ uint32_t stream_pitch[PIPE_MAX_ATTRIBS]; -+#endif -+}; -+ -+struct swr_blend_state { -+ struct pipe_blend_state pipe; -+ BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS]; -+}; -+ -+/* Shadows of SWR API DrawState */ -+struct swr_shadow_state { -+ SWR_SURFACE_STATE *attachment[SWR_NUM_ATTACHMENTS]; -+ SWR_RASTSTATE rastState; -+ SWR_VIEWPORT vp; -+ SWR_VIEWPORT_MATRIX vpm; -+}; -+ -+void swr_update_derived(struct swr_context *, -+ const struct pipe_draw_info * = nullptr); -+ -+/* -+ * Conversion functions: Convert mesa state defines to SWR. 
-+ */ -+ -+static INLINE SWR_STENCILOP -+swr_convert_stencil_op(const UINT op) -+{ -+ switch (op) { -+ case PIPE_STENCIL_OP_KEEP: -+ return STENCILOP_KEEP; -+ case PIPE_STENCIL_OP_ZERO: -+ return STENCILOP_ZERO; -+ case PIPE_STENCIL_OP_REPLACE: -+ return STENCILOP_REPLACE; -+ case PIPE_STENCIL_OP_INCR: -+ return STENCILOP_INCRSAT; -+ case PIPE_STENCIL_OP_DECR: -+ return STENCILOP_DECRSAT; -+ case PIPE_STENCIL_OP_INCR_WRAP: -+ return STENCILOP_INCR; -+ case PIPE_STENCIL_OP_DECR_WRAP: -+ return STENCILOP_DECR; -+ case PIPE_STENCIL_OP_INVERT: -+ return STENCILOP_INVERT; -+ default: -+ assert(0 && "Unsupported stencil op"); -+ return STENCILOP_KEEP; -+ } -+} -+ -+static INLINE SWR_FORMAT -+swr_convert_index_type(const UINT index_size) -+{ -+ switch (index_size) { -+ case sizeof(unsigned char): -+ return R8_UINT; -+ case sizeof(unsigned short): -+ return R16_UINT; -+ case sizeof(unsigned int): -+ return R32_UINT; -+ default: -+ assert(0 && "Unsupported index type"); -+ return R32_UINT; -+ } -+} -+ -+ -+static INLINE UINT -+swr_convert_depth_func(const UINT pipe_func) -+{ -+ switch (pipe_func) { -+ case PIPE_FUNC_NEVER: -+ return ZFUNC_NEVER; -+ case PIPE_FUNC_LESS: -+ return ZFUNC_LT; -+ case PIPE_FUNC_EQUAL: -+ return ZFUNC_EQ; -+ case PIPE_FUNC_LEQUAL: -+ return ZFUNC_LE; -+ case PIPE_FUNC_GREATER: -+ return ZFUNC_GT; -+ case PIPE_FUNC_NOTEQUAL: -+ return ZFUNC_NE; -+ case PIPE_FUNC_GEQUAL: -+ return ZFUNC_GE; -+ case PIPE_FUNC_ALWAYS: -+ return ZFUNC_ALWAYS; -+ default: -+ assert(0 && "Unsupported depth func"); -+ return ZFUNC_ALWAYS; -+ } -+} -+ -+ -+static INLINE SWR_CULLMODE -+swr_convert_cull_mode(const UINT cull_face) -+{ -+ switch (cull_face) { -+ case PIPE_FACE_NONE: -+ return SWR_CULLMODE_NONE; -+ case PIPE_FACE_FRONT: -+ return SWR_CULLMODE_FRONT; -+ case PIPE_FACE_BACK: -+ return SWR_CULLMODE_BACK; -+ case PIPE_FACE_FRONT_AND_BACK: -+ return SWR_CULLMODE_BOTH; -+ default: -+ assert(0 && "Invalid cull mode"); -+ return SWR_CULLMODE_NONE; -+ } -+} -+ -+static INLINE SWR_BLEND_OP -+swr_convert_blend_func(const UINT blend_func) -+{ -+ switch (blend_func) { -+ case PIPE_BLEND_ADD: -+ return BLENDOP_ADD; -+ case PIPE_BLEND_SUBTRACT: -+ return BLENDOP_SUBTRACT; -+ case PIPE_BLEND_REVERSE_SUBTRACT: -+ return BLENDOP_REVSUBTRACT; -+ case PIPE_BLEND_MIN: -+ return BLENDOP_MIN; -+ case PIPE_BLEND_MAX: -+ return BLENDOP_MAX; -+ default: -+ assert(0 && "Invalid blend func"); -+ return BLENDOP_ADD; -+ } -+} -+ -+static INLINE SWR_BLEND_FACTOR -+swr_convert_blend_factor(const UINT blend_factor) -+{ -+ switch (blend_factor) { -+ case PIPE_BLENDFACTOR_ONE: -+ return BLENDFACTOR_ONE; -+ case PIPE_BLENDFACTOR_SRC_COLOR: -+ return BLENDFACTOR_SRC_COLOR; -+ case PIPE_BLENDFACTOR_SRC_ALPHA: -+ return BLENDFACTOR_SRC_ALPHA; -+ case PIPE_BLENDFACTOR_DST_ALPHA: -+ return BLENDFACTOR_DST_ALPHA; -+ case PIPE_BLENDFACTOR_DST_COLOR: -+ return BLENDFACTOR_DST_COLOR; -+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: -+ return BLENDFACTOR_SRC_ALPHA_SATURATE; -+ case PIPE_BLENDFACTOR_CONST_COLOR: -+ return BLENDFACTOR_CONST_COLOR; -+ case PIPE_BLENDFACTOR_CONST_ALPHA: -+ return BLENDFACTOR_CONST_ALPHA; -+ case PIPE_BLENDFACTOR_SRC1_COLOR: -+ return BLENDFACTOR_SRC1_COLOR; -+ case PIPE_BLENDFACTOR_SRC1_ALPHA: -+ return BLENDFACTOR_SRC1_ALPHA; -+ case PIPE_BLENDFACTOR_ZERO: -+ return BLENDFACTOR_ZERO; -+ case PIPE_BLENDFACTOR_INV_SRC_COLOR: -+ return BLENDFACTOR_INV_SRC_COLOR; -+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: -+ return BLENDFACTOR_INV_SRC_ALPHA; -+ case PIPE_BLENDFACTOR_INV_DST_ALPHA: -+ return 
BLENDFACTOR_INV_DST_ALPHA; -+ case PIPE_BLENDFACTOR_INV_DST_COLOR: -+ return BLENDFACTOR_INV_DST_COLOR; -+ case PIPE_BLENDFACTOR_INV_CONST_COLOR: -+ return BLENDFACTOR_INV_CONST_COLOR; -+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: -+ return BLENDFACTOR_INV_CONST_ALPHA; -+ case PIPE_BLENDFACTOR_INV_SRC1_COLOR: -+ return BLENDFACTOR_INV_SRC1_COLOR; -+ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: -+ return BLENDFACTOR_INV_SRC1_ALPHA; -+ default: -+ assert(0 && "Invalid blend factor"); -+ return BLENDFACTOR_ONE; -+ } -+} -+#endif -diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp -new file mode 100644 -index 0000000..8e01e32 ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_tex_sample.cpp -@@ -0,0 +1,338 @@ -+/************************************************************************** -+ * -+ * Copyright 2009 VMware, Inc. -+ * All rights reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. -+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ * -+ **************************************************************************/ -+ -+/** -+ * Largely a copy of llvmpipe's lp_tex_sample.c -+ */ -+ -+/** -+ * Texture sampling code generation -+ * -+ * This file is nothing more than ugly glue between three largely independent -+ * entities: -+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) -+ * - texture sampling code generation (i.e., lp_build_sample_soa) -+ * - SWR driver -+ * -+ * All interesting code is in the functions mentioned above. There is really -+ * nothing to see here. -+ * -+ * @author Jose Fonseca -+ */ -+ -+#include "state.h" -+#include "JitManager.h" -+#include "state_llvm.h" -+ -+#include "pipe/p_defines.h" -+#include "pipe/p_shader_tokens.h" -+#include "gallivm/lp_bld_debug.h" -+#include "gallivm/lp_bld_const.h" -+#include "gallivm/lp_bld_type.h" -+#include "gallivm/lp_bld_sample.h" -+#include "gallivm/lp_bld_tgsi.h" -+#include "util/u_memory.h" -+ -+#include "swr_tex_sample.h" -+#include "swr_context_llvm.h" -+ -+ -+/** -+ * This provides the bridge between the sampler state store in -+ * lp_jit_context and lp_jit_texture and the sampler code -+ * generator. It provides the texture layout information required by -+ * the texture sampler code generator in terms of the state stored in -+ * lp_jit_context and lp_jit_texture in runtime. 
-+ */ -+struct swr_sampler_dynamic_state { -+ struct lp_sampler_dynamic_state base; -+ -+ const struct swr_sampler_static_state *static_state; -+}; -+ -+ -+/** -+ * This is the bridge between our sampler and the TGSI translator. -+ */ -+struct swr_sampler_soa { -+ struct lp_build_sampler_soa base; -+ -+ struct swr_sampler_dynamic_state dynamic_state; -+}; -+ -+ -+/** -+ * Fetch the specified member of the lp_jit_texture structure. -+ * \param emit_load if TRUE, emit the LLVM load instruction to actually -+ * fetch the field's value. Otherwise, just emit the -+ * GEP code to address the field. -+ * -+ * @sa http://llvm.org/docs/GetElementPtr.html -+ */ -+static LLVMValueRef -+swr_texture_member(const struct lp_sampler_dynamic_state *base, -+ struct gallivm_state *gallivm, -+ LLVMValueRef context_ptr, -+ unsigned texture_unit, -+ unsigned member_index, -+ const char *member_name, -+ boolean emit_load) -+{ -+ LLVMBuilderRef builder = gallivm->builder; -+ LLVMValueRef indices[4]; -+ LLVMValueRef ptr; -+ LLVMValueRef res; -+ -+ assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+ /* context[0] */ -+ indices[0] = lp_build_const_int32(gallivm, 0); -+ /* context[0].textures */ -+ indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); -+ /* context[0].textures[unit] */ -+ indices[2] = lp_build_const_int32(gallivm, texture_unit); -+ /* context[0].textures[unit].member */ -+ indices[3] = lp_build_const_int32(gallivm, member_index); -+ -+ ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); -+ -+ if (emit_load) -+ res = LLVMBuildLoad(builder, ptr, ""); -+ else -+ res = ptr; -+ -+ lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); -+ -+ return res; -+} -+ -+ -+/** -+ * Helper macro to instantiate the functions that generate the code to -+ * fetch the members of lp_jit_texture to fulfill the sampler code -+ * generator requests. -+ * -+ * This complexity is the price we have to pay to keep the texture -+ * sampler code generator a reusable module without dependencies to -+ * swr internals. -+ */ -+#define SWR_TEXTURE_MEMBER(_name, _emit_load) \ -+ static LLVMValueRef swr_texture_##_name( \ -+ const struct lp_sampler_dynamic_state *base, \ -+ struct gallivm_state *gallivm, \ -+ LLVMValueRef context_ptr, \ -+ unsigned texture_unit) \ -+ { \ -+ return swr_texture_member(base, \ -+ gallivm, \ -+ context_ptr, \ -+ texture_unit, \ -+ swr_jit_texture_##_name, \ -+ #_name, \ -+ _emit_load); \ -+ } -+ -+ -+SWR_TEXTURE_MEMBER(width, TRUE) -+SWR_TEXTURE_MEMBER(height, TRUE) -+SWR_TEXTURE_MEMBER(depth, TRUE) -+SWR_TEXTURE_MEMBER(first_level, TRUE) -+SWR_TEXTURE_MEMBER(last_level, TRUE) -+SWR_TEXTURE_MEMBER(base_ptr, TRUE) -+SWR_TEXTURE_MEMBER(row_stride, FALSE) -+SWR_TEXTURE_MEMBER(img_stride, FALSE) -+SWR_TEXTURE_MEMBER(mip_offsets, FALSE) -+ -+ -+/** -+ * Fetch the specified member of the lp_jit_sampler structure. -+ * \param emit_load if TRUE, emit the LLVM load instruction to actually -+ * fetch the field's value. Otherwise, just emit the -+ * GEP code to address the field. 
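swr_texture_member() builds a four-index GEP, so the generated code ends up addressing the same field a plain C expression would. Roughly, with hypothetical struct names standing in for the real swr_draw_context layout from swr_context_llvm.h, the index list { 0, swr_draw_context_texturesFS, texture_unit, member_index } corresponds to:

    #include <cstdint>

    // Hypothetical C-level picture only; the real layout is swr_draw_context.
    struct jit_texture  { uint32_t width, height, depth; /* ... more fields ... */ };
    struct draw_context { jit_texture texturesFS[32];    /* ... other members ... */ };

    // indices[0] = 0 dereferences the context pointer, indices[1] selects the
    // texturesFS member, indices[2] the texture unit, indices[3] the field;
    // with emit_load = TRUE an LLVM load of that address is emitted as well.
    static uint32_t load_width(const draw_context *ctx, unsigned unit)
    {
        return ctx->texturesFS[unit].width;
    }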
-+ * -+ * @sa http://llvm.org/docs/GetElementPtr.html -+ */ -+static LLVMValueRef -+swr_sampler_member(const struct lp_sampler_dynamic_state *base, -+ struct gallivm_state *gallivm, -+ LLVMValueRef context_ptr, -+ unsigned sampler_unit, -+ unsigned member_index, -+ const char *member_name, -+ boolean emit_load) -+{ -+ LLVMBuilderRef builder = gallivm->builder; -+ LLVMValueRef indices[4]; -+ LLVMValueRef ptr; -+ LLVMValueRef res; -+ -+ assert(sampler_unit < PIPE_MAX_SAMPLERS); -+ -+ /* context[0] */ -+ indices[0] = lp_build_const_int32(gallivm, 0); -+ /* context[0].samplers */ -+ indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); -+ /* context[0].samplers[unit] */ -+ indices[2] = lp_build_const_int32(gallivm, sampler_unit); -+ /* context[0].samplers[unit].member */ -+ indices[3] = lp_build_const_int32(gallivm, member_index); -+ -+ ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); -+ -+ if (emit_load) -+ res = LLVMBuildLoad(builder, ptr, ""); -+ else -+ res = ptr; -+ -+ lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); -+ -+ return res; -+} -+ -+ -+#define SWR_SAMPLER_MEMBER(_name, _emit_load) \ -+ static LLVMValueRef swr_sampler_##_name( \ -+ const struct lp_sampler_dynamic_state *base, \ -+ struct gallivm_state *gallivm, \ -+ LLVMValueRef context_ptr, \ -+ unsigned sampler_unit) \ -+ { \ -+ return swr_sampler_member(base, \ -+ gallivm, \ -+ context_ptr, \ -+ sampler_unit, \ -+ swr_jit_sampler_##_name, \ -+ #_name, \ -+ _emit_load); \ -+ } -+ -+ -+SWR_SAMPLER_MEMBER(min_lod, TRUE) -+SWR_SAMPLER_MEMBER(max_lod, TRUE) -+SWR_SAMPLER_MEMBER(lod_bias, TRUE) -+SWR_SAMPLER_MEMBER(border_color, FALSE) -+ -+ -+static void -+swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) -+{ -+ FREE(sampler); -+} -+ -+ -+/** -+ * Fetch filtered values from texture. -+ * The 'texel' parameter returns four vectors corresponding to R, G, B, A. -+ */ -+static void -+swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, -+ struct gallivm_state *gallivm, -+ const struct lp_sampler_params *params) -+{ -+ struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; -+ unsigned texture_index = params->texture_index; -+ unsigned sampler_index = params->sampler_index; -+ -+ assert(sampler_index < PIPE_MAX_SAMPLERS); -+ assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+#if 0 -+ lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); -+#else -+ lp_build_sample_soa( -+ &sampler->dynamic_state.static_state[texture_index].texture_state, -+ &sampler->dynamic_state.static_state[sampler_index].sampler_state, -+ &sampler->dynamic_state.base, -+ gallivm, -+ params); -+#endif -+} -+ -+/** -+ * Fetch the texture size. 
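Each SWR_TEXTURE_MEMBER / SWR_SAMPLER_MEMBER line is X-macro-style shorthand; expanding one instantiation by hand, in the context of this file, makes the generated accessor explicit. SWR_SAMPLER_MEMBER(min_lod, TRUE) produces, give or take formatting:

    static LLVMValueRef
    swr_sampler_min_lod(const struct lp_sampler_dynamic_state *base,
                        struct gallivm_state *gallivm,
                        LLVMValueRef context_ptr,
                        unsigned sampler_unit)
    {
       // Fetch (and, because emit_load is TRUE, load) samplersFS[unit].min_lod.
       return swr_sampler_member(base, gallivm, context_ptr, sampler_unit,
                                 swr_jit_sampler_min_lod, "min_lod", TRUE);
    }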
-+ */ -+static void -+swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, -+ struct gallivm_state *gallivm, -+ struct lp_type type, -+ unsigned texture_unit, -+ unsigned target, -+ LLVMValueRef context_ptr, -+ boolean is_sviewinfo, -+ enum lp_sampler_lod_property lod_property, -+ LLVMValueRef explicit_lod, /* optional */ -+ LLVMValueRef *sizes_out) -+{ -+ struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; -+ -+ assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); -+ -+ lp_build_size_query_soa( -+ gallivm, -+ &sampler->dynamic_state.static_state[texture_unit].texture_state, -+ &sampler->dynamic_state.base, -+ type, -+ texture_unit, -+ target, -+ context_ptr, -+ is_sviewinfo, -+ lod_property, -+ explicit_lod, -+ sizes_out); -+} -+ -+ -+struct lp_build_sampler_soa * -+swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) -+{ -+ struct swr_sampler_soa *sampler; -+ -+ sampler = CALLOC_STRUCT(swr_sampler_soa); -+ if (!sampler) -+ return NULL; -+ -+ sampler->base.destroy = swr_sampler_soa_destroy; -+ sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel; -+ sampler->base.emit_size_query = swr_sampler_soa_emit_size_query; -+ sampler->dynamic_state.base.width = swr_texture_width; -+ sampler->dynamic_state.base.height = swr_texture_height; -+ sampler->dynamic_state.base.depth = swr_texture_depth; -+ sampler->dynamic_state.base.first_level = swr_texture_first_level; -+ sampler->dynamic_state.base.last_level = swr_texture_last_level; -+ sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr; -+ sampler->dynamic_state.base.row_stride = swr_texture_row_stride; -+ sampler->dynamic_state.base.img_stride = swr_texture_img_stride; -+ sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets; -+ sampler->dynamic_state.base.min_lod = swr_sampler_min_lod; -+ sampler->dynamic_state.base.max_lod = swr_sampler_max_lod; -+ sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias; -+ sampler->dynamic_state.base.border_color = swr_sampler_border_color; -+ -+ sampler->dynamic_state.static_state = static_state; -+ -+ return &sampler->base; -+} -diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h -new file mode 100644 -index 0000000..f5c368c ---- /dev/null -+++ b/src/gallium/drivers/swr/swr_tex_sample.h -@@ -0,0 +1,47 @@ -+/************************************************************************** -+ * -+ * Copyright 2007 VMware, Inc. -+ * All Rights Reserved. -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the -+ * "Software"), to deal in the Software without restriction, including -+ * without limitation the rights to use, copy, modify, merge, publish, -+ * distribute, sub license, and/or sell copies of the Software, and to -+ * permit persons to whom the Software is furnished to do so, subject to -+ * the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the -+ * next paragraph) shall be included in all copies or substantial portions -+ * of the Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
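swr_sampler_soa_create() returns the callback table the gallivm TGSI translator consumes: emit_tex_sample and emit_size_query drive lp_build_sample_soa / lp_build_size_query_soa, and the dynamic-state members point at the accessors generated above. A rough caller-side sketch, assuming the call site is the fragment-shader JIT (presumably swr_shader.cpp) and relying only on names declared in this file and in swr_tex_sample.h:

    /* Hypothetical usage sketch, not the driver's actual call site. */
    static void
    compile_fs_with_sampler(const struct swr_sampler_static_state *static_state)
    {
       /* static_state holds one texture_state/sampler_state pair per bound view */
       struct lp_build_sampler_soa *sampler = swr_sampler_soa_create(static_state);

       /* ... pass `sampler` to the TGSI -> LLVM translation so TEX opcodes end
        * up in swr_sampler_soa_emit_fetch_texel() / _emit_size_query() ... */

       sampler->destroy(sampler);
    }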
-+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR -+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -+ * -+ **************************************************************************/ -+ -+#pragma once -+ -+#include "gallivm/lp_bld.h" -+ -+struct swr_sampler_static_state { -+ /* -+ * These attributes are effectively interleaved for more sane key handling. -+ * However, there might be lots of null space if the amount of samplers and -+ * textures isn't the same. -+ */ -+ struct lp_static_sampler_state sampler_state; -+ struct lp_static_texture_state texture_state; -+}; -+ -+/** -+ * Pure-LLVM texture sampling code generator. -+ * -+ */ -+struct lp_build_sampler_soa * -+swr_sampler_soa_create(const struct swr_sampler_static_state *key); -diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am -index d99caae..527d01b 100644 ---- a/src/gallium/targets/libgl-xlib/Makefile.am -+++ b/src/gallium/targets/libgl-xlib/Makefile.am -@@ -84,4 +84,9 @@ endif - EXTRA_lib@GL_LIB@_la_DEPENDENCIES = libgl-xlib.sym - EXTRA_DIST = SConscript libgl-xlib.sym - -+if HAVE_GALLIUM_SWR -+lib@GL_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) -+AM_CPPFLAGS += -DGALLIUM_SWR -+endif -+ - include $(top_srcdir)/install-gallium-links.mk -diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript -index df5a220..da77ad5 100644 ---- a/src/gallium/targets/libgl-xlib/SConscript -+++ b/src/gallium/targets/libgl-xlib/SConscript -@@ -46,6 +46,10 @@ if env['llvm']: - env.Append(CPPDEFINES = ['GALLIUM_LLVMPIPE']) - env.Prepend(LIBS = [llvmpipe]) - -+if env['llvm']: -+ env.Append(CPPDEFINES = ['GALLIUM_SWR']) -+ env.Prepend(LIBS = [swr]) -+ - # Disallow undefined symbols - if env['platform'] != 'darwin': - env.Append(SHLINKFLAGS = ['-Wl,-z,defs']) -diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am -index 38e515f..5d39486 100644 ---- a/src/gallium/targets/osmesa/Makefile.am -+++ b/src/gallium/targets/osmesa/Makefile.am -@@ -74,6 +74,12 @@ lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) - lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) - endif - -+if HAVE_GALLIUM_SWR -+AM_CPPFLAGS += -DGALLIUM_SWR -+lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) -+lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) -+endif -+ - EXTRA_lib@OSMESA_LIB@_la_DEPENDENCIES = osmesa.sym - EXTRA_DIST = \ - osmesa.sym \ --- -2.6.2 - diff --git a/0002-swr-484541-Initial-public-SWR.patch b/0002-swr-484541-Initial-public-SWR.patch deleted file mode 100644 index c43d9c0..0000000 --- a/0002-swr-484541-Initial-public-SWR.patch +++ /dev/null @@ -1,46197 +0,0 @@ -From 378e7aa8e96eb976aa4fe8cea6e522c3c2566031 Mon Sep 17 00:00:00 2001 -From: Tim Rowley -Date: Mon, 19 Oct 2015 13:34:59 -0500 -Subject: [PATCH 2/3] swr-484541: Initial public SWR - ---- - .../drivers/swr/rasterizer/common/containers.hpp | 208 + - .../drivers/swr/rasterizer/common/formats.cpp | 5029 ++++++++++++++++++++ - .../drivers/swr/rasterizer/common/formats.h | 222 + - src/gallium/drivers/swr/rasterizer/common/isa.hpp | 235 + - src/gallium/drivers/swr/rasterizer/common/os.h | 194 + - .../swr/rasterizer/common/rdtsc_buckets.cpp | 176 + - 
.../drivers/swr/rasterizer/common/rdtsc_buckets.h | 195 + - .../swr/rasterizer/common/rdtsc_buckets_shared.h | 167 + - .../drivers/swr/rasterizer/common/simdintrin.h | 792 +++ - .../drivers/swr/rasterizer/common/swr_assert.cpp | 141 + - .../drivers/swr/rasterizer/common/swr_assert.h | 84 + - src/gallium/drivers/swr/rasterizer/core/api.cpp | 1461 ++++++ - src/gallium/drivers/swr/rasterizer/core/api.h | 483 ++ - src/gallium/drivers/swr/rasterizer/core/arena.cpp | 126 + - src/gallium/drivers/swr/rasterizer/core/arena.h | 63 + - .../drivers/swr/rasterizer/core/backend.cpp | 1150 +++++ - src/gallium/drivers/swr/rasterizer/core/backend.h | 45 + - src/gallium/drivers/swr/rasterizer/core/blend.h | 318 ++ - src/gallium/drivers/swr/rasterizer/core/clip.cpp | 201 + - src/gallium/drivers/swr/rasterizer/core/clip.h | 851 ++++ - src/gallium/drivers/swr/rasterizer/core/context.h | 444 ++ - .../drivers/swr/rasterizer/core/depthstencil.h | 215 + - src/gallium/drivers/swr/rasterizer/core/fifo.hpp | 144 + - .../swr/rasterizer/core/format_conversion.h | 167 + - .../drivers/swr/rasterizer/core/format_traits.h | 2954 ++++++++++++ - .../drivers/swr/rasterizer/core/format_types.h | 1053 ++++ - .../drivers/swr/rasterizer/core/frontend.cpp | 1972 ++++++++ - src/gallium/drivers/swr/rasterizer/core/frontend.h | 326 ++ - src/gallium/drivers/swr/rasterizer/core/knobs.h | 139 + - .../drivers/swr/rasterizer/core/knobs_init.h | 98 + - .../drivers/swr/rasterizer/core/multisample.h | 562 +++ - src/gallium/drivers/swr/rasterizer/core/pa.h | 1205 +++++ - src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 1330 ++++++ - .../drivers/swr/rasterizer/core/rasterizer.cpp | 1217 +++++ - .../drivers/swr/rasterizer/core/rasterizer.h | 34 + - .../drivers/swr/rasterizer/core/rdtsc_core.cpp | 90 + - .../drivers/swr/rasterizer/core/rdtsc_core.h | 175 + - src/gallium/drivers/swr/rasterizer/core/state.h | 918 ++++ - .../drivers/swr/rasterizer/core/tessellator.h | 88 + - .../drivers/swr/rasterizer/core/threads.cpp | 884 ++++ - src/gallium/drivers/swr/rasterizer/core/threads.h | 62 + - .../drivers/swr/rasterizer/core/tilemgr.cpp | 105 + - src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 392 ++ - src/gallium/drivers/swr/rasterizer/core/utils.cpp | 148 + - src/gallium/drivers/swr/rasterizer/core/utils.h | 745 +++ - .../drivers/swr/rasterizer/jitter/JitManager.cpp | 292 ++ - .../drivers/swr/rasterizer/jitter/JitManager.h | 182 + - .../drivers/swr/rasterizer/jitter/blend_jit.cpp | 473 ++ - .../drivers/swr/rasterizer/jitter/blend_jit.h | 49 + - .../drivers/swr/rasterizer/jitter/builder.cpp | 56 + - .../drivers/swr/rasterizer/jitter/builder.h | 66 + - .../drivers/swr/rasterizer/jitter/builder_gen.cpp | 1052 ++++ - .../drivers/swr/rasterizer/jitter/builder_gen.h | 205 + - .../drivers/swr/rasterizer/jitter/builder_math.h | 34 + - .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 1195 +++++ - .../drivers/swr/rasterizer/jitter/builder_misc.h | 141 + - .../drivers/swr/rasterizer/jitter/builder_x86.cpp | 242 + - .../drivers/swr/rasterizer/jitter/builder_x86.h | 65 + - .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 1450 ++++++ - .../drivers/swr/rasterizer/jitter/fetch_jit.h | 128 + - .../drivers/swr/rasterizer/jitter/jit_api.h | 105 + - .../rasterizer/jitter/scripts/gen_llvm_types.py | 334 ++ - .../swr/rasterizer/jitter/streamout_jit.cpp | 348 ++ - .../drivers/swr/rasterizer/jitter/streamout_jit.h | 91 + - .../drivers/swr/rasterizer/memory/ClearTile.cpp | 287 ++ - .../drivers/swr/rasterizer/memory/Convert.h | 698 +++ - 
.../drivers/swr/rasterizer/memory/LoadTile.cpp | 382 ++ - .../drivers/swr/rasterizer/memory/StoreTile.cpp | 1645 +++++++ - .../swr/rasterizer/memory/TilingFunctions.h | 518 ++ - .../drivers/swr/rasterizer/memory/tilingtraits.h | 239 + - .../drivers/swr/rasterizer/scripts/gen_knobs.py | 79 + - .../drivers/swr/rasterizer/scripts/knob_defs.py | 212 + - .../swr/rasterizer/scripts/mako/__init__.py | 8 + - .../swr/rasterizer/scripts/mako/_ast_util.py | 845 ++++ - .../drivers/swr/rasterizer/scripts/mako/ast.py | 178 + - .../drivers/swr/rasterizer/scripts/mako/cache.py | 238 + - .../drivers/swr/rasterizer/scripts/mako/cmd.py | 62 + - .../drivers/swr/rasterizer/scripts/mako/codegen.py | 1237 +++++ - .../drivers/swr/rasterizer/scripts/mako/compat.py | 174 + - .../swr/rasterizer/scripts/mako/exceptions.py | 373 ++ - .../drivers/swr/rasterizer/scripts/mako/filters.py | 201 + - .../drivers/swr/rasterizer/scripts/mako/lexer.py | 441 ++ - .../drivers/swr/rasterizer/scripts/mako/lookup.py | 359 ++ - .../swr/rasterizer/scripts/mako/parsetree.py | 594 +++ - .../drivers/swr/rasterizer/scripts/mako/pygen.py | 299 ++ - .../swr/rasterizer/scripts/mako/pyparser.py | 232 + - .../drivers/swr/rasterizer/scripts/mako/runtime.py | 878 ++++ - .../swr/rasterizer/scripts/mako/template.py | 705 +++ - .../drivers/swr/rasterizer/scripts/mako/util.py | 360 ++ - .../rasterizer/scripts/templates/knobs.template | 106 + - 90 files changed, 45466 insertions(+) - create mode 100644 src/gallium/drivers/swr/rasterizer/common/containers.hpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/formats.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/formats.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/isa.hpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/os.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdintrin.h - create mode 100644 src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/common/swr_assert.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/api.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/api.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/blend.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/clip.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/clip.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/context.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/depthstencil.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/fifo.hpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_conversion.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_traits.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_types.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/frontend.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/frontend.h - create mode 100644 
src/gallium/drivers/swr/rasterizer/core/knobs.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/knobs_init.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/multisample.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/pa.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/state.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/tessellator.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/threads.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/threads.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/tilemgr.h - create mode 100644 src/gallium/drivers/swr/rasterizer/core/utils.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/core/utils.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/JitManager.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_math.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_api.h - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/Convert.h - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h - create mode 100644 src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py - create mode 
100644 src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/template.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/util.py - create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template - -diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp -new file mode 100644 -index 0000000..bc96c5f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp -@@ -0,0 +1,208 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+****************************************************************************/ -+ -+#ifndef SWRLIB_CONTAINERS_HPP__ -+#define SWRLIB_CONTAINERS_HPP__ -+ -+#include -+#include "common/os.h" -+ -+namespace SWRL -+{ -+ -+template -+struct UncheckedFixedVector -+{ -+ UncheckedFixedVector() : mSize(0) -+ { -+ } -+ -+ UncheckedFixedVector(std::size_t size, T const& exemplar) -+ { -+ this->mSize = 0; -+ for (std::size_t i = 0; i < size; ++i) -+ this->push_back(exemplar); -+ } -+ -+ template -+ UncheckedFixedVector(Iter fst, Iter lst) -+ { -+ this->mSize = 0; -+ for ( ; fst != lst; ++fst) -+ this->push_back(*fst); -+ } -+ -+ UncheckedFixedVector(UncheckedFixedVector const& UFV) -+ { -+ this->mSize = 0; -+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i) -+ (*this)[i] = UFV[i]; -+ this->mSize = UFV.size(); -+ } -+ -+ UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) -+ { -+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i) -+ (*this)[i] = UFV[i]; -+ this->mSize = UFV.size(); -+ return *this; -+ } -+ -+ T* begin() { return &this->mElements[0]; } -+ T* end() { return &this->mElements[0] + this->mSize; } -+ T const* begin() const { return &this->mElements[0]; } -+ T const* end() const { return &this->mElements[0] + this->mSize; } -+ -+ friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) -+ { -+ if (L.size() != R.size()) return false; -+ for (std::size_t i = 0, N = L.size(); i < N; ++i) -+ { -+ if (L[i] != R[i]) return false; -+ } -+ return true; -+ } -+ -+ friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) -+ { -+ if (L.size() != R.size()) return true; -+ for (std::size_t i = 0, N = L.size(); i < N; ++i) -+ { -+ if (L[i] != R[i]) return true; -+ } -+ return false; -+ } -+ -+ T& operator[](std::size_t idx) -+ { -+ return this->mElements[idx]; -+ } -+ T const& operator[](std::size_t idx) const -+ { -+ return this->mElements[idx]; -+ } -+ void push_back(T const& t) -+ { -+ this->mElements[this->mSize] = t; -+ ++this->mSize; -+ } -+ void pop_back() -+ { -+ SWR_ASSERT(this->mSize > 0); -+ --this->mSize; -+ } -+ T& back() -+ { -+ return this->mElements[this->mSize-1]; -+ } -+ T const& back() const -+ { -+ return this->mElements[this->mSize-1]; -+ } -+ bool empty() const -+ { -+ return this->mSize == 0; -+ } -+ std::size_t size() const -+ { -+ return this->mSize; -+ } -+ void resize(std::size_t sz) -+ { -+ this->mSize = sz; -+ } -+ void clear() -+ { -+ this->resize(0); -+ } -+private: -+ std::size_t mSize; -+ T mElements[NUM_ELEMENTS]; -+}; -+ -+template -+struct FixedStack : UncheckedFixedVector -+{ -+ FixedStack() {} -+ -+ void push(T const& t) -+ { -+ this->push_back(t); -+ } -+ -+ void pop() -+ { -+ this->pop_back(); -+ } -+ -+ T& top() -+ { -+ return this->back(); -+ } -+ -+ T const& top() const -+ { -+ return this->back(); -+ } -+}; -+ -+template -+struct CRCHash -+{ -+ static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects templated type size is even multiple of 4B"); -+ UINT operator()(const T& k) const -+ { -+ UINT *pData = (UINT*)&k; -+ UINT crc = 0; -+ for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i) -+ { -+ crc = _mm_crc32_u32(crc, pData[i]); -+ } -+ return crc; -+ } -+}; -+ -+}// end SWRL -+ -+namespace std -+{ -+ -+template -+struct hash> -+{ -+ size_t operator() (SWRL::UncheckedFixedVector const& v) const -+ { -+ if (v.size() == 0) return 0; -+ std::hash H; -+ size_t x = H(v[0]); -+ if (v.size() == 1) return x; -+ for (size_t i = 1; i < v.size(); ++i) -+ x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + 
(x>>2); -+ return x; -+ } -+}; -+ -+ -+}// end std. -+ -+#endif//SWRLIB_CONTAINERS_HPP__ -diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp -new file mode 100644 -index 0000000..7e90ee7 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp -@@ -0,0 +1,5029 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file formats.cpp -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#include "formats.h" -+ -+// lookup table for unorm8 srgb -> float conversion -+const uint32_t srgb8Table[256] = { -+ 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd, -+ 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1, -+ 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9, -+ 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f, -+ 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb, -+ 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19, -+ 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3, -+ 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706, 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307, -+ 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 
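The SWRL helpers that close out containers.hpp are small building blocks: UncheckedFixedVector stores its elements inline with no bounds checking, FixedStack adapts it into a push/pop/top stack, CRCHash hashes a POD key one DWORD at a time with _mm_crc32_u32, and the std::hash specialization folds per-element hashes together with the 0x9e3779b9 golden-ratio mix (the boost::hash_combine recipe). A self-contained sketch of that combine step, using plain std::vector so it stands alone:

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <vector>

    // Mix each element hash into the running value with the golden-ratio
    // constant plus shifts, exactly as the specialization above does.
    static std::size_t hash_range(const std::vector<uint32_t> &v)
    {
        if (v.empty()) return 0;
        std::hash<uint32_t> H;
        std::size_t x = H(v[0]);
        for (std::size_t i = 1; i < v.size(); ++i)
            x ^= H(v[i]) + 0x9e3779b9 + (x << 6) + (x >> 2);
        return x;
    }

The shifts spread high and low bits of the accumulated value so that vectors differing in only one element still land in different buckets.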
0x3e741e7f, 0x3e78188c, 0x3e7c1c38, 0x3e8014c2, 0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283, -+ 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333, -+ 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54, -+ 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, -+ 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1, -+ 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7, -+ 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2, -+ 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000, -+}; -+ -+// order must match SWR_FORMAT -+const SWR_FORMAT_INFO gFormatInfo[] = { -+ // R32G32B32A32_FLOAT (0x0) -+ { -+ "R32G32B32A32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_SINT (0x1) -+ { -+ "R32G32B32A32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_UINT (0x2) -+ { -+ "R32G32B32A32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
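srgb8Table above stores, for each 8-bit sRGB-encoded value, the IEEE-754 bit pattern of its linear-light float equivalent (entry 0 is 0.0f, entry 255 is 0x3f800000 = 1.0f), and the entries are consistent with the standard IEC 61966-2-1 decode curve. A hedged, stand-alone sketch of how such a table could be generated (the generated-file comment says the real table is produced by a script):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Standard sRGB -> linear transfer function.
    static float srgb_to_linear(float c)
    {
        return (c <= 0.04045f) ? c / 12.92f
                               : std::pow((c + 0.055f) / 1.055f, 2.4f);
    }

    int main()
    {
        for (int i = 0; i < 256; ++i) {
            float f = srgb_to_linear(i / 255.0f);
            uint32_t bits;
            std::memcpy(&bits, &f, sizeof(bits));   // reproduces the table to within rounding
            std::printf("0x%08x,%c", (unsigned)bits, (i % 16 == 15) ? '\n' : ' ');
        }
        return 0;
    }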
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32G32B32X32_FLOAT (0x6) -+ { -+ "R32G32B32X32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_SSCALED (0x7) -+ { -+ "R32G32B32A32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32A32_USCALED (0x8) -+ { -+ "R32G32B32A32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 32, 32, 32, 32 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x10 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x11 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 
0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x18 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x19 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x20 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x21 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x22 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x23 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x24 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x25 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x26 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x27 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x28 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x29 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x2e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ 
// 0x2f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x30 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x31 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x32 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x33 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x34 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x35 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x36 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x37 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x38 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x39 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, 
false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x3f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32G32B32_FLOAT (0x40) -+ { -+ "R32G32B32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32_SINT (0x41) -+ { -+ "R32G32B32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32_UINT (0x42) -+ { -+ "R32G32B32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x43 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x44 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32G32B32_SSCALED (0x45) -+ { -+ "R32G32B32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32B32_USCALED (0x46) -+ { -+ "R32G32B32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 32, 32, 32, 0 }, // Bits per component -+ 96, // Bits per element -+ 12, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x47 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x48 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x49 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x4f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x50 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x51 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x52 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x53 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x54 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x55 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x56 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x57 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x58 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x59 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5a (Padding) -+ { -+ "UNKNOWN", 
-+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x5f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x60 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x61 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x62 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x63 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x64 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x65 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x66 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 
0.0f }, -+ 1, 1, }, -+ // 0x67 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x68 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x69 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x6f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x70 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x71 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x72 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x73 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, 
false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x74 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x75 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x76 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x77 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x78 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x79 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x7f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16A16_UNORM (0x80) -+ { -+ "R16G16B16A16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, 
-+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_SNORM (0x81) -+ { -+ "R16G16B16A16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_SINT (0x82) -+ { -+ "R16G16B16A16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_UINT (0x83) -+ { -+ "R16G16B16A16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_FLOAT (0x84) -+ { -+ "R16G16B16A16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_FLOAT (0x85) -+ { -+ "R32G32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_SINT (0x86) -+ { -+ "R32G32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_UINT (0x87) -+ { -+ "R32G32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_FLOAT_X8X24_TYPELESS (0x88) -+ { -+ "R32_FLOAT_X8X24_TYPELESS", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x89 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x8d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16X16_UNORM (0x8e) -+ { -+ "R16G16B16X16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, 
true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16X16_FLOAT (0x8f) -+ { -+ "R16G16B16X16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x90 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x91 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x92 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16A16_SSCALED (0x93) -+ { -+ "R16G16B16A16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16A16_USCALED (0x94) -+ { -+ "R16G16B16A16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 16, 16, 16, 16 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_SSCALED (0x95) -+ { -+ "R32G32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32G32_USCALED (0x96) -+ { -+ "R32G32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x97 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32_FLOAT_X8X24_TYPELESS_LD (0x98) -+ { -+ "R32_FLOAT_X8X24_TYPELESS_LD", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 32, 32, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x99 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x9f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa8 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xa9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xaa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xab (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xac (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xad (Padding) -+ { -+ "UNKNOWN", 
-+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xae (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xaf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb8 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xb9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 
0.0f }, -+ 1, 1, }, -+ // 0xba (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbe (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xbf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B8G8R8A8_UNORM (0xc0) -+ { -+ "B8G8R8A8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B8G8R8A8_UNORM_SRGB (0xc1) -+ { -+ "B8G8R8A8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_UNORM (0xc2) -+ { -+ "R10G10B10A2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_UNORM_SRGB (0xc3) -+ { -+ "R10G10B10A2_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_UINT (0xc4) -+ { -+ "R10G10B10A2_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xc5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xc6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8A8_UNORM (0xc7) -+ { -+ "R8G8B8A8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_UNORM_SRGB (0xc8) -+ { -+ "R8G8B8A8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_SNORM (0xc9) -+ { -+ "R8G8B8A8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_SINT (0xca) -+ { -+ "R8G8B8A8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_UINT (0xcb) -+ { -+ "R8G8B8A8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_UNORM (0xcc) -+ { -+ "R16G16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_SNORM (0xcd) -+ { -+ "R16G16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_SINT (0xce) -+ { -+ "R16G16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_UINT (0xcf) -+ { -+ "R16G16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_FLOAT (0xd0) -+ { -+ "R16G16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_UNORM (0xd1) -+ { -+ "B10G10R10A2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_UNORM_SRGB (0xd2) -+ { -+ "B10G10R10A2_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R11G11B10_FLOAT (0xd3) -+ { -+ "R11G11B10_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 11, 11, 10, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xd4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xd5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R32_SINT (0xd6) -+ { -+ "R32_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_UINT (0xd7) -+ { -+ "R32_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_FLOAT (0xd8) -+ { -+ "R32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R24_UNORM_X8_TYPELESS (0xd9) -+ { -+ "R24_UNORM_X8_TYPELESS", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 24, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xda (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xdb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R24_UNORM_X8_TYPELESS_LD (0xdc) -+ { -+ "R24_UNORM_X8_TYPELESS_LD", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 24, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xdd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xde (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xdf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // A32_FLOAT (0xe5) -+ { -+ "A32_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xe6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xe8 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B8G8R8X8_UNORM (0xe9) -+ { -+ "B8G8R8X8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B8G8R8X8_UNORM_SRGB (0xea) -+ { -+ "B8G8R8X8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8X8_UNORM (0xeb) -+ { -+ "R8G8B8X8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8X8_UNORM_SRGB (0xec) -+ { -+ "R8G8B8X8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R9G9B9E5_SHAREDEXP (0xed) -+ { -+ "R9G9B9E5_SHAREDEXP", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 9, 9, 9, 5 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10X2_UNORM (0xee) -+ { -+ "B10G10R10X2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xef (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xf2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R10G10B10X2_USCALED (0xf3) -+ { -+ "R10G10B10X2_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_SSCALED (0xf4) -+ { -+ "R8G8B8A8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8A8_USCALED (0xf5) -+ { -+ "R8G8B8A8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_SSCALED (0xf6) -+ { -+ "R16G16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16_USCALED (0xf7) -+ { -+ "R16G16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 16, 16, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_SSCALED (0xf8) -+ { -+ "R32_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R32_USCALED (0xf9) -+ { -+ "R32_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 32, 0, 0, 0 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0xfa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfb (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xfe (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0xff (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B5G6R5_UNORM (0x100) -+ { -+ "B5G6R5_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 0 }, // Swizzle -+ { 5, 6, 5, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G6R5_UNORM_SRGB (0x101) -+ { -+ "B5G6R5_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 0 }, // Swizzle -+ { 5, 6, 5, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 3, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G5R5A1_UNORM (0x102) -+ { -+ "B5G5R5A1_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G5R5A1_UNORM_SRGB (0x103) -+ { -+ "B5G5R5A1_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B4G4R4A4_UNORM (0x104) -+ { -+ "B4G4R4A4_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 4, 4, 4, 4 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B4G4R4A4_UNORM_SRGB (0x105) -+ { -+ "B4G4R4A4_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 4, 4, 4, 4 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_UNORM (0x106) -+ { -+ "R8G8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_SNORM (0x107) -+ { -+ "R8G8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_SINT (0x108) -+ { -+ "R8G8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_UINT (0x109) -+ { -+ "R8G8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_UNORM (0x10a) -+ { -+ "R16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_SNORM (0x10b) -+ { -+ "R16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_SINT (0x10c) -+ { -+ "R16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_UINT (0x10d) -+ { -+ "R16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_FLOAT (0x10e) -+ { -+ "R16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x10f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x110 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x111 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x112 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // A16_UNORM (0x113) -+ { -+ "A16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x114 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x115 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x116 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // A16_FLOAT (0x117) -+ { -+ "A16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x118 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x119 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // B5G5R5X1_UNORM (0x11a) -+ { -+ "B5G5R5X1_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B5G5R5X1_UNORM_SRGB (0x11b) -+ { -+ "B5G5R5X1_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 5, 5, 5, 1 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 4, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_SSCALED (0x11c) -+ { -+ "R8G8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8_USCALED (0x11d) -+ { -+ "R8G8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 0, 0 }, // Swizzle -+ { 8, 8, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 2, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_SSCALED (0x11e) -+ { -+ "R16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16_USCALED (0x11f) -+ { -+ "R16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 16, 0, 0, 0 }, // Bits per component -+ 16, // Bits per element -+ 2, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x120 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x121 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x122 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x123 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x124 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x125 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x126 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x127 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x128 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x129 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 
0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x12f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x130 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x131 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x132 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x133 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x134 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x135 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x136 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x137 (Padding) -+ { -+ "UNKNOWN", -+ { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x138 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x139 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x13f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8_UNORM (0x140) -+ { -+ "R8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_SNORM (0x141) -+ { -+ "R8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_SINT (0x142) -+ { -+ "R8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_UINT (0x143) -+ { -+ "R8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // A8_UNORM (0x144) -+ { -+ "A8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 3, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x145 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x146 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x147 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x148 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8_SSCALED (0x149) -+ { -+ "R8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8_USCALED (0x14a) -+ { -+ "R8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 8, // Bits per element -+ 1, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 0, 0, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x14b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x14f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x150 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x151 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x152 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x153 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x154 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x155 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 
0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x156 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x157 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x158 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x159 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x15f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x160 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x161 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x162 (Padding) -+ { -+ "UNKNOWN", -+ { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x163 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x164 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x165 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x166 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x167 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x168 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x169 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 
0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x16f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x170 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x171 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x172 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x173 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x174 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x175 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x176 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x177 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x178 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x179 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17a (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17b (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, 
false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17c (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17d (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x17f (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x180 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x181 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x182 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // YCRCB_SWAPUVY (0x183) -+ { -+ "YCRCB_SWAPUVY", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ true, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 2, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x184 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x185 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // BC1_UNORM (0x186) -+ { -+ "BC1_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC2_UNORM (0x187) -+ { -+ "BC2_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC3_UNORM (0x188) -+ { -+ "BC3_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC4_UNORM (0x189) -+ { -+ "BC4_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC5_UNORM (0x18a) -+ { -+ "BC5_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC1_UNORM_SRGB (0x18b) -+ { -+ "BC1_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC2_UNORM_SRGB (0x18c) -+ { -+ "BC2_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC3_UNORM_SRGB (0x18d) -+ { -+ "BC3_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // 0x18e (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // YCRCB_SWAPUV (0x18f) -+ { -+ "YCRCB_SWAPUV", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 8, 8, 8, 8 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ true, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 2, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x190 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x191 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x192 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8_UNORM (0x193) -+ { -+ "R8G8B8_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_SNORM (0x194) -+ { -+ "R8G8B8_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_SSCALED (0x195) -+ { -+ "R8G8B8_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_USCALED (0x196) -+ { -+ "R8G8B8_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x197 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x198 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // BC4_SNORM (0x199) -+ { -+ "BC4_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 64, // Bits per element -+ 8, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC5_SNORM (0x19a) -+ { -+ "BC5_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // R16G16B16_FLOAT (0x19b) -+ { -+ "R16G16B16_FLOAT", -+ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_UNORM (0x19c) -+ { -+ "R16G16B16_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_SNORM (0x19d) -+ { -+ "R16G16B16_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_SSCALED (0x19e) -+ { -+ "R16G16B16_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_USCALED (0x19f) -+ { -+ "R16G16B16_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1a0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // BC7_UNORM (0x1a2) -+ { -+ "BC7_UNORM", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ false, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // BC7_UNORM_SRGB (0x1a3) -+ { -+ "BC7_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 0, 0, 0 }, // Swizzle -+ { 8, 0, 0, 0 }, // Bits per component -+ 128, // Bits per element -+ 16, // Bytes per element -+ 1, // Num components -+ true, // isSRGB -+ true, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? 
-+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor -+ 4, // bcWidth -+ 4, // bcHeight -+ }, -+ // 0x1a4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1a7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8_UNORM_SRGB (0x1a8) -+ { -+ "R8G8B8_UNORM_SRGB", -+ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ true, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1a9 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1aa (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ab (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ac (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ad (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1ae (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1af (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R16G16B16_UINT (0x1b0) -+ { -+ "R16G16B16_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R16G16B16_SINT (0x1b1) -+ { -+ "R16G16B16_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 16, 16, 16, 0 }, // Bits per component -+ 48, // Bits per element -+ 6, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1b2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R10G10B10A2_SNORM (0x1b3) -+ { -+ "R10G10B10A2_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_USCALED (0x1b4) -+ { -+ "R10G10B10A2_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_SSCALED (0x1b5) -+ { -+ "R10G10B10A2_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R10G10B10A2_SINT (0x1b6) -+ { -+ "R10G10B10A2_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_SNORM (0x1b7) -+ { -+ "B10G10R10A2_SNORM", -+ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { true, true, true, true }, // Is normalized? -+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_USCALED (0x1b8) -+ { -+ "B10G10R10A2_USCALED", -+ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_SSCALED (0x1b9) -+ { -+ "B10G10R10A2_SSCALED", -+ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, -+ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_UINT (0x1ba) -+ { -+ "B10G10R10A2_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // B10G10R10A2_SINT (0x1bb) -+ { -+ "B10G10R10A2_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 2, 1, 0, 3 }, // Swizzle -+ { 10, 10, 10, 2 }, // Bits per component -+ 32, // Bits per element -+ 4, // Bytes per element -+ 4, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? 
-+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // 0x1bc (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1bd (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1be (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1bf (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c0 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c1 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c2 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c3 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c4 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c5 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c6 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // 0x1c7 (Padding) -+ { -+ "UNKNOWN", -+ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, -+ { false, false, false, false }, -+ { 0.0f, 0.0f, 0.0f, 0.0f }, -+ 1, 1, }, -+ // R8G8B8_UINT (0x1c8) -+ { -+ "R8G8B8_UINT", -+ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, 
SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+ // R8G8B8_SINT (0x1c9) -+ { -+ "R8G8B8_SINT", -+ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, -+ { 0, 0, 0, 0x1 }, // Defaults for missing components -+ { 0, 1, 2, 0 }, // Swizzle -+ { 8, 8, 8, 0 }, // Bits per component -+ 24, // Bits per element -+ 3, // Bytes per element -+ 3, // Num components -+ false, // isSRGB -+ false, // isBC -+ false, // isSubsampled -+ { false, false, false, false }, // Is normalized? -+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor -+ 1, // bcWidth -+ 1, // bcHeight -+ }, -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h -new file mode 100644 -index 0000000..ff1fdb2 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/formats.h -@@ -0,0 +1,222 @@ -+ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file formats.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+#include "common/os.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TYPE - Format component type -+////////////////////////////////////////////////////////////////////////// -+enum SWR_TYPE -+{ -+ SWR_TYPE_UNKNOWN, -+ SWR_TYPE_UNUSED, -+ SWR_TYPE_UNORM, -+ SWR_TYPE_SNORM, -+ SWR_TYPE_UINT, -+ SWR_TYPE_SINT, -+ SWR_TYPE_FLOAT, -+ SWR_TYPE_SSCALED, -+ SWR_TYPE_USCALED, -+}; -+////////////////////////////////////////////////////////////////////////// -+/// SWR_FORMAT -+////////////////////////////////////////////////////////////////////////// -+enum SWR_FORMAT -+{ -+ R32G32B32A32_FLOAT = 0x0, -+ R32G32B32A32_SINT = 0x1, -+ R32G32B32A32_UINT = 0x2, -+ R32G32B32X32_FLOAT = 0x6, -+ R32G32B32A32_SSCALED = 0x7, -+ R32G32B32A32_USCALED = 0x8, -+ R32G32B32_FLOAT = 0x40, -+ R32G32B32_SINT = 0x41, -+ R32G32B32_UINT = 0x42, -+ R32G32B32_SSCALED = 0x45, -+ R32G32B32_USCALED = 0x46, -+ R16G16B16A16_UNORM = 0x80, -+ R16G16B16A16_SNORM = 0x81, -+ R16G16B16A16_SINT = 0x82, -+ R16G16B16A16_UINT = 0x83, -+ R16G16B16A16_FLOAT = 0x84, -+ R32G32_FLOAT = 0x85, -+ R32G32_SINT = 0x86, -+ R32G32_UINT = 0x87, -+ R32_FLOAT_X8X24_TYPELESS = 0x88, -+ R16G16B16X16_UNORM = 0x8E, -+ R16G16B16X16_FLOAT = 0x8F, -+ R16G16B16A16_SSCALED = 0x93, -+ R16G16B16A16_USCALED = 0x94, -+ R32G32_SSCALED = 0x95, -+ R32G32_USCALED = 0x96, -+ R32_FLOAT_X8X24_TYPELESS_LD = 0x98, -+ B8G8R8A8_UNORM = 0xC0, -+ B8G8R8A8_UNORM_SRGB = 0xC1, -+ R10G10B10A2_UNORM = 0xC2, -+ R10G10B10A2_UNORM_SRGB = 0xC3, -+ R10G10B10A2_UINT = 0xC4, -+ R8G8B8A8_UNORM = 0xC7, -+ R8G8B8A8_UNORM_SRGB = 0xC8, -+ R8G8B8A8_SNORM = 0xC9, -+ R8G8B8A8_SINT = 0xCA, -+ R8G8B8A8_UINT = 0xCB, -+ R16G16_UNORM = 0xCC, -+ R16G16_SNORM = 0xCD, -+ R16G16_SINT = 0xCE, -+ R16G16_UINT = 0xCF, -+ R16G16_FLOAT = 0xD0, -+ B10G10R10A2_UNORM = 0xD1, -+ B10G10R10A2_UNORM_SRGB = 0xD2, -+ R11G11B10_FLOAT = 0xD3, -+ R32_SINT = 0xD6, -+ R32_UINT = 0xD7, -+ R32_FLOAT = 0xD8, -+ R24_UNORM_X8_TYPELESS = 0xD9, -+ R24_UNORM_X8_TYPELESS_LD = 0xDC, -+ A32_FLOAT = 0xE5, -+ B8G8R8X8_UNORM = 0xE9, -+ B8G8R8X8_UNORM_SRGB = 0xEA, -+ R8G8B8X8_UNORM = 0xEB, -+ R8G8B8X8_UNORM_SRGB = 0xEC, -+ R9G9B9E5_SHAREDEXP = 0xED, -+ B10G10R10X2_UNORM = 0xEE, -+ R10G10B10X2_USCALED = 0xF3, -+ R8G8B8A8_SSCALED = 0xF4, -+ R8G8B8A8_USCALED = 0xF5, -+ R16G16_SSCALED = 0xF6, -+ R16G16_USCALED = 0xF7, -+ R32_SSCALED = 0xF8, -+ R32_USCALED = 0xF9, -+ B5G6R5_UNORM = 0x100, -+ B5G6R5_UNORM_SRGB = 0x101, -+ B5G5R5A1_UNORM = 0x102, -+ B5G5R5A1_UNORM_SRGB = 0x103, -+ B4G4R4A4_UNORM = 0x104, -+ B4G4R4A4_UNORM_SRGB = 0x105, -+ R8G8_UNORM = 0x106, -+ R8G8_SNORM = 0x107, -+ R8G8_SINT = 0x108, -+ R8G8_UINT = 0x109, -+ R16_UNORM = 0x10A, -+ R16_SNORM = 0x10B, -+ R16_SINT = 0x10C, -+ R16_UINT = 0x10D, -+ R16_FLOAT = 0x10E, -+ A16_UNORM = 0x113, -+ A16_FLOAT = 0x117, -+ B5G5R5X1_UNORM = 0x11A, -+ B5G5R5X1_UNORM_SRGB = 0x11B, -+ R8G8_SSCALED = 0x11C, -+ R8G8_USCALED = 0x11D, -+ R16_SSCALED = 0x11E, -+ R16_USCALED = 0x11F, -+ R8_UNORM = 0x140, -+ R8_SNORM = 0x141, -+ R8_SINT = 0x142, -+ R8_UINT = 0x143, -+ A8_UNORM = 0x144, -+ R8_SSCALED = 0x149, -+ R8_USCALED = 0x14A, -+ YCRCB_SWAPUVY = 0x183, -+ BC1_UNORM = 0x186, -+ BC2_UNORM = 0x187, -+ BC3_UNORM = 0x188, -+ BC4_UNORM = 0x189, -+ BC5_UNORM = 0x18A, -+ BC1_UNORM_SRGB = 0x18B, -+ BC2_UNORM_SRGB = 0x18C, -+ BC3_UNORM_SRGB = 0x18D, -+ YCRCB_SWAPUV = 0x18F, -+ 
R8G8B8_UNORM = 0x193, -+ R8G8B8_SNORM = 0x194, -+ R8G8B8_SSCALED = 0x195, -+ R8G8B8_USCALED = 0x196, -+ BC4_SNORM = 0x199, -+ BC5_SNORM = 0x19A, -+ R16G16B16_FLOAT = 0x19B, -+ R16G16B16_UNORM = 0x19C, -+ R16G16B16_SNORM = 0x19D, -+ R16G16B16_SSCALED = 0x19E, -+ R16G16B16_USCALED = 0x19F, -+ BC7_UNORM = 0x1A2, -+ BC7_UNORM_SRGB = 0x1A3, -+ R8G8B8_UNORM_SRGB = 0x1A8, -+ R16G16B16_UINT = 0x1B0, -+ R16G16B16_SINT = 0x1B1, -+ R10G10B10A2_SNORM = 0x1B3, -+ R10G10B10A2_USCALED = 0x1B4, -+ R10G10B10A2_SSCALED = 0x1B5, -+ R10G10B10A2_SINT = 0x1B6, -+ B10G10R10A2_SNORM = 0x1B7, -+ B10G10R10A2_USCALED = 0x1B8, -+ B10G10R10A2_SSCALED = 0x1B9, -+ B10G10R10A2_UINT = 0x1BA, -+ B10G10R10A2_SINT = 0x1BB, -+ R8G8B8_UINT = 0x1C8, -+ R8G8B8_SINT = 0x1C9, -+ NUM_SWR_FORMATS = 0x1CA, -+}; -+////////////////////////////////////////////////////////////////////////// -+/// SWR_FORMAT_INFO - Format information -+////////////////////////////////////////////////////////////////////////// -+struct SWR_FORMAT_INFO -+{ -+ const char* name; -+ SWR_TYPE type[4]; -+ uint32_t defaults[4]; -+ uint32_t swizzle[4]; ///< swizzle per component -+ uint32_t bpc[4]; ///< bits per component -+ uint32_t bpp; ///< bits per pixel -+ uint32_t Bpp; ///< bytes per pixel -+ uint32_t numComps; ///< number of components -+ bool isSRGB; -+ bool isBC; -+ bool isSubsampled; -+ bool isNormalized[4]; -+ float toFloat[4]; -+ uint32_t bcWidth; -+ uint32_t bcHeight; -+}; -+ -+extern const SWR_FORMAT_INFO gFormatInfo[]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Retrieves format info struct for given format. -+/// @param format - SWR format -+INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format) -+{ -+ return gFormatInfo[format]; -+} -+ -+// lookup table for unorm8 srgb -> float conversion -+extern const uint32_t srgb8Table[256]; -diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp -new file mode 100644 -index 0000000..ef38179 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp -@@ -0,0 +1,235 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+****************************************************************************/ -+ -+#pragma once -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if defined(_WIN32) -+#include -+#else -+#include -+#include -+#endif -+ -+class InstructionSet -+{ -+public: -+ InstructionSet() : CPU_Rep() {}; -+ -+ // getters -+ std::string Vendor(void) { return CPU_Rep.vendor_; } -+ std::string Brand(void) { return CPU_Rep.brand_; } -+ -+ bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; } -+ bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; } -+ bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; } -+ bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; } -+ bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; } -+ bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; } -+ bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; } -+ bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; } -+ bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; } -+ bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; } -+ bool AES(void) { return CPU_Rep.f_1_ECX_[25]; } -+ bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; } -+ bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; } -+ bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; } -+ -+ bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; } -+ bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; } -+ bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; } -+ bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; } -+ bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; } -+ bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; } -+ bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; } -+ bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; } -+ bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; } -+ -+ bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; } -+ bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; } -+ bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; } -+ bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; } -+ bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; } -+ bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; } -+ bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; } -+ bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; } -+ bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; } -+ bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; } -+ -+ bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; } -+ -+ bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; } -+ bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; } -+ bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; } -+ bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; } -+ bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; } -+ bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; } -+ -+ bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; } -+ bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; } -+ bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; } -+ bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; } -+ bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; } -+ -+ bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; } -+ bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; } -+ bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; } -+ bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; } -+ bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; } -+ bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; } -+ bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; } -+ -+private: -+ class InstructionSet_Internal -+ { -+ public: -+ InstructionSet_Internal() -+ : nIds_{ 0 
},
-+        nExIds_{ 0 },
-+        isIntel_{ false },
-+        isAMD_{ false },
-+        f_1_ECX_{ 0 },
-+        f_1_EDX_{ 0 },
-+        f_7_EBX_{ 0 },
-+        f_7_ECX_{ 0 },
-+        f_81_ECX_{ 0 },
-+        f_81_EDX_{ 0 },
-+        data_{},
-+        extdata_{}
-+        {
-+            //int cpuInfo[4] = {-1};
-+            std::array<int, 4> cpui;
-+
-+            // Calling __cpuid with 0x0 as the function_id argument
-+            // gets the number of the highest valid function ID.
-+#if defined(_WIN32)
-+            __cpuid(cpui.data(), 0);
-+            nIds_ = cpui[0];
-+#else
-+            nIds_ = __get_cpuid_max(0, NULL);
-+#endif
-+
-+            for (int i = 0; i <= nIds_; ++i)
-+            {
-+#if defined(_WIN32)
-+                __cpuidex(cpui.data(), i, 0);
-+#else
-+                int *data = cpui.data();
-+                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
-+#endif
-+                data_.push_back(cpui);
-+            }
-+
-+            // Capture vendor string
-+            char vendor[0x20];
-+            memset(vendor, 0, sizeof(vendor));
-+            *reinterpret_cast<int*>(vendor) = data_[0][1];
-+            *reinterpret_cast<int*>(vendor + 4) = data_[0][3];
-+            *reinterpret_cast<int*>(vendor + 8) = data_[0][2];
-+            vendor_ = vendor;
-+            if (vendor_ == "GenuineIntel")
-+            {
-+                isIntel_ = true;
-+            }
-+            else if (vendor_ == "AuthenticAMD")
-+            {
-+                isAMD_ = true;
-+            }
-+
-+            // load bitset with flags for function 0x00000001
-+            if (nIds_ >= 1)
-+            {
-+                f_1_ECX_ = data_[1][2];
-+                f_1_EDX_ = data_[1][3];
-+            }
-+
-+            // load bitset with flags for function 0x00000007
-+            if (nIds_ >= 7)
-+            {
-+                f_7_EBX_ = data_[7][1];
-+                f_7_ECX_ = data_[7][2];
-+            }
-+
-+            // Calling __cpuid with 0x80000000 as the function_id argument
-+            // gets the number of the highest valid extended ID.
-+#if defined(_WIN32)
-+            __cpuid(cpui.data(), 0x80000000);
-+            nExIds_ = cpui[0];
-+#else
-+            nExIds_ = __get_cpuid_max(0x80000000, NULL);
-+#endif
-+
-+            char brand[0x40];
-+            memset(brand, 0, sizeof(brand));
-+
-+            for (unsigned i = 0x80000000; i <= nExIds_; ++i)
-+            {
-+#if defined(_WIN32)
-+                __cpuidex(cpui.data(), i, 0);
-+#else
-+                int *data = cpui.data();
-+                __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
-+#endif
-+                extdata_.push_back(cpui);
-+            }
-+
-+            // load bitset with flags for function 0x80000001
-+            if (nExIds_ >= 0x80000001)
-+            {
-+                f_81_ECX_ = extdata_[1][2];
-+                f_81_EDX_ = extdata_[1][3];
-+            }
-+
-+            // Interpret CPU brand string if reported
-+            if (nExIds_ >= 0x80000004)
-+            {
-+                memcpy(brand, extdata_[2].data(), sizeof(cpui));
-+                memcpy(brand + 16, extdata_[3].data(), sizeof(cpui));
-+                memcpy(brand + 32, extdata_[4].data(), sizeof(cpui));
-+                brand_ = brand;
-+            }
-+        };
-+
-+        int nIds_;
-+        unsigned nExIds_;
-+        std::string vendor_;
-+        std::string brand_;
-+        bool isIntel_;
-+        bool isAMD_;
-+        std::bitset<32> f_1_ECX_;
-+        std::bitset<32> f_1_EDX_;
-+        std::bitset<32> f_7_EBX_;
-+        std::bitset<32> f_7_ECX_;
-+        std::bitset<32> f_81_ECX_;
-+        std::bitset<32> f_81_EDX_;
-+        std::vector<std::array<int, 4>> data_;
-+        std::vector<std::array<int, 4>> extdata_;
-+    };
-+    const InstructionSet_Internal CPU_Rep;
-+};
-diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
-new file mode 100644
-index 0000000..d7def2b
---- /dev/null
-+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
-@@ -0,0 +1,194 @@
-+/****************************************************************************
-+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#ifndef __SWR_OS_H__ -+#define __SWR_OS_H__ -+ -+#include "core/knobs.h" -+ -+#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX) -+ -+#define SWR_API __cdecl -+ -+#ifndef _CRT_SECURE_NO_WARNINGS -+#define _CRT_SECURE_NO_WARNINGS -+#endif -+ -+#ifndef NOMINMAX -+#define NOMINMAX -+#endif -+#include "Windows.h" -+#include -+#include -+ -+#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD -+#define THREAD __declspec(thread) -+#define INLINE __forceinline -+#define DEBUGBREAK __debugbreak() -+ -+#define PRAGMA_WARNING_PUSH_DISABLE(...) 
\ -+ __pragma(warning(push));\ -+ __pragma(warning(disable:__VA_ARGS__)); -+ -+#define PRAGMA_WARNING_POP() __pragma(warning(pop)) -+ -+#if defined(_WIN32) -+#if defined(_WIN64) -+#define BitScanForwardSizeT BitScanForward64 -+#define _mm_popcount_sizeT _mm_popcnt_u64 -+#else -+#define BitScanForwardSizeT BitScanForward -+#define _mm_popcount_sizeT _mm_popcnt_u32 -+#endif -+#endif -+ -+#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) -+ -+#define SWR_API -+ -+#include -+#include -+#include -+#include -+#include -+ -+typedef void VOID; -+typedef void* LPVOID; -+typedef CARD8 BOOL; -+typedef wchar_t WCHAR; -+typedef uint16_t UINT16; -+typedef int INT; -+typedef int INT32; -+typedef unsigned int UINT; -+typedef uint32_t UINT32; -+typedef uint64_t UINT64; -+typedef int64_t INT64; -+typedef void* HANDLE; -+typedef float FLOAT; -+typedef int LONG; -+typedef CARD8 BYTE; -+typedef unsigned char UCHAR; -+typedef unsigned int DWORD; -+ -+#undef FALSE -+#define FALSE 0 -+ -+#undef TRUE -+#define TRUE 1 -+ -+#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH))) -+#define THREAD __thread -+#ifndef INLINE -+#define INLINE __inline -+#endif -+#define DEBUGBREAK asm ("int $3") -+#define __cdecl -+#define __declspec(X) -+ -+#define GCC_VERSION (__GNUC__ * 10000 \ -+ + __GNUC_MINOR__ * 100 \ -+ + __GNUC_PATCHLEVEL__) -+ -+#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500) -+inline -+uint64_t __rdtsc() -+{ -+ long low, high; -+ asm volatile("rdtsc" : "=a"(low), "=d"(high)); -+ return (low | ((uint64_t)high << 32)); -+} -+#endif -+ -+// Intrinsic not defined in gcc -+static INLINE -+void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a) -+{ -+ _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a)); -+ _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1)); -+} -+ -+inline -+unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask) -+{ -+ *Index = __builtin_ctz(Mask); -+ return (Mask != 0); -+} -+ -+inline -+unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask) -+{ -+ *Index = __builtin_clz(Mask); -+ return (Mask != 0); -+} -+ -+inline -+void *_aligned_malloc(unsigned int size, unsigned int alignment) -+{ -+ void *ret; -+ if (posix_memalign(&ret, alignment, size)) -+ { -+ return NULL; -+ } -+ return ret; -+} -+ -+inline -+unsigned char _bittest(const LONG *a, LONG b) -+{ -+ return ((*(unsigned *)(a) & (1 << b)) != 0); -+} -+ -+#if defined(_WIN32) -+static inline -+unsigned int _mm_popcnt_u32(unsigned int v) -+{ -+ return __builtin_popcount(v); -+} -+#endif -+ -+#define _aligned_free free -+#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) -+#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) -+#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) -+#define _ReadWriteBarrier() asm volatile("" ::: "memory") -+#define __stdcall -+ -+#define PRAGMA_WARNING_PUSH_DISABLE(...) -+#define PRAGMA_WARNING_POP() -+ -+#else -+ -+#error Unsupported OS/system. -+ -+#endif -+ -+#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) -+#if KNOB_SIMD_WIDTH == 8 -+#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32) -+#else -+#error Unknown SIMD width! 
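// Illustrative sketch (not part of the original patch): with KNOB_SIMD_WIDTH == 8,
// OSALIGNSIMD pads a declaration out to the 32-byte AVX register width, e.g.
//
//     OSALIGNSIMD(float) lanes[KNOB_SIMD_WIDTH];
//
// which expands to __declspec(align(32)) float lanes[8] on MSVC and to
// float __attribute__((aligned(32))) lanes[8] on gcc/clang via the OSALIGN macro above.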
-+#endif -+ -+#include "common/swr_assert.h" -+ -+#endif//__SWR_OS_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp -new file mode 100644 -index 0000000..469302b ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp -@@ -0,0 +1,176 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rdtsc_buckets.cpp -+* -+* @brief implementation of rdtsc buckets. -+* -+* Notes: -+* -+******************************************************************************/ -+#include "rdtsc_buckets.h" -+#include -+ -+THREAD UINT tlsThreadId = 0; -+ -+void BucketManager::RegisterThread(const std::string& name) -+{ -+ BUCKET_THREAD newThread; -+ newThread.name = name; -+ newThread.root.children.reserve(mBuckets.size()); -+ newThread.root.id = 0; -+ newThread.root.pParent = nullptr; -+ newThread.pCurrent = &newThread.root; -+ -+ mThreadMutex.lock(); -+ -+ // assign unique thread id for this thread -+ size_t id = mThreads.size(); -+ newThread.id = (UINT)id; -+ tlsThreadId = (UINT)id; -+ -+ // open threadviz file if enabled -+ if (mThreadViz) -+ { -+ char fileName[255]; -+ sprintf(fileName, "threadviz_thread.%d.dat", newThread.id); -+ newThread.vizFile = fopen(fileName, "wb"); -+ } -+ -+ // store new thread -+ mThreads.push_back(newThread); -+ -+ mThreadMutex.unlock(); -+} -+ -+UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) -+{ -+ size_t id = mBuckets.size(); -+ mBuckets.push_back(desc); -+ return (UINT)id; -+} -+ -+void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) -+{ -+ const char *arrows[] = { -+ "", -+ "|-> ", -+ " |-> ", -+ " |-> ", -+ " |-> ", -+ " |-> ", -+ " |-> " -+ }; -+ -+ // compute percent of total cycles used by this bucket -+ float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0); -+ -+ // compute percent of parent cycles used by this bucket -+ float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); -+ -+ // compute average cycle count per invocation -+ UINT64 CPE = bucket.elapsed / bucket.count; -+ -+ BUCKET_DESC &desc = mBuckets[bucket.id]; -+ -+ // construct hierarchy visualization -+ char hier[80]; -+ strcpy(hier, arrows[level]); -+ 
strcat(hier, desc.name.c_str()); -+ -+ // print out -+ fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", percentTotal, percentParent, bucket.elapsed, CPE, bucket.count, (unsigned long)0, (UINT32)(0), hier); -+ -+ // dump all children of this bucket -+ for (const BUCKET& child : bucket.children) -+ { -+ if (child.count) -+ { -+ PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child); -+ } -+ } -+} -+ -+void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) -+{ -+ // print header -+ fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str()); -+ fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); -+ -+ // compute thread level total cycle counts across all buckets from root -+ const BUCKET& root = thread.root; -+ UINT64 totalCycles = 0; -+ for (const BUCKET& child : root.children) -+ { -+ totalCycles += child.elapsed; -+ } -+ -+ for (const BUCKET& child : root.children) -+ { -+ if (child.count) -+ { -+ PrintBucket(f, 0, totalCycles, totalCycles, child); -+ } -+ } -+} -+ -+void BucketManager::DumpThreadViz() -+{ -+ // ensure all thread data is flushed -+ mThreadMutex.lock(); -+ for (auto& thread : mThreads) -+ { -+ fflush(thread.vizFile); -+ fclose(thread.vizFile); -+ } -+ mThreadMutex.unlock(); -+ -+ // dump bucket descriptions -+ FILE* f = fopen("threadviz_buckets.dat", "wb"); -+ for (auto& bucket : mBuckets) -+ { -+ Serialize(f, bucket); -+ } -+ fclose(f); -+} -+ -+void BucketManager::PrintReport(const std::string& filename) -+{ -+ if (mThreadViz) -+ { -+ DumpThreadViz(); -+ } -+ else -+ { -+ FILE* f = fopen(filename.c_str(), "w"); -+ -+ mThreadMutex.lock(); -+ for (const BUCKET_THREAD& thread : mThreads) -+ { -+ PrintThread(f, thread); -+ fprintf(f, "\n"); -+ } -+ mThreadMutex.unlock(); -+ -+ fclose(f); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h -new file mode 100644 -index 0000000..03530f5 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h -@@ -0,0 +1,195 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rdtsc_buckets.h -+* -+* @brief declaration for rdtsc buckets. 
-+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "os.h" -+#include -+#include -+ -+#include "rdtsc_buckets_shared.h" -+ -+// unique thread id stored in thread local storage -+extern THREAD UINT tlsThreadId; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief BucketManager encapsulates a single instance of the buckets -+/// functionality. There can be one or many bucket managers active -+/// at any time. The manager owns all the threads and -+/// bucket information that have been registered to it. -+class BucketManager -+{ -+public: -+ BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz) {} -+ -+ // removes all registered thread data -+ void ClearThreads() -+ { -+ mThreadMutex.lock(); -+ mThreads.clear(); -+ mThreadMutex.unlock(); -+ } -+ -+ // removes all registered buckets -+ void ClearBuckets() -+ { -+ mBuckets.clear(); -+ } -+ -+ /// Registers a new thread with the manager. -+ /// @param name - name of thread, used for labels in reports and threadviz -+ void RegisterThread(const std::string& name); -+ -+ /// Registers a new bucket type with the manager. Returns a unique -+ /// id which should be used in subsequent calls to start/stop the bucket -+ /// @param desc - description of the bucket -+ /// @return unique id -+ UINT RegisterBucket(const BUCKET_DESC& desc); -+ -+ // dump threadviz data -+ void DumpThreadViz(); -+ -+ // print report -+ void PrintReport(const std::string& filename); -+ -+ // start capturing -+ INLINE void StartCapture() -+ { -+ mCapturing = true; -+ } -+ -+ // stop capturing -+ INLINE void StopCapture() -+ { -+ mCapturing = false; -+ -+ // wait for all threads to pop back to root bucket -+ bool stillCapturing = true; -+ while (stillCapturing) -+ { -+ stillCapturing = false; -+ for (const BUCKET_THREAD& t : mThreads) -+ { -+ if (t.pCurrent != &t.root) -+ { -+ stillCapturing = true; -+ continue; -+ } -+ } -+ } -+ } -+ -+ // start a bucket -+ // @param id generated by RegisterBucket -+ INLINE void StartBucket(UINT id) -+ { -+ if (!mCapturing) return; -+ -+ SWR_ASSERT(tlsThreadId < mThreads.size()); -+ -+ BUCKET_THREAD& bt = mThreads[tlsThreadId]; -+ -+ // if threadviz is enabled, only need to dump start info to threads viz file -+ if (mThreadViz) -+ { -+ SWR_ASSERT(bt.vizFile != nullptr); -+ if (mBuckets[id].enableThreadViz) -+ { -+ VIZ_START_DATA data{ VIZ_START, id, __rdtsc() }; -+ Serialize(bt.vizFile, data); -+ } -+ } -+ else -+ { -+ if (bt.pCurrent->children.size() < mBuckets.size()) -+ { -+ bt.pCurrent->children.resize(mBuckets.size()); -+ } -+ BUCKET &child = bt.pCurrent->children[id]; -+ child.pParent = bt.pCurrent; -+ child.id = id; -+ child.start = __rdtsc(); -+ -+ // update thread's currently executing bucket -+ bt.pCurrent = &child; -+ } -+ -+ bt.level++; -+ } -+ -+ // stop the currently executing bucket -+ INLINE void StopBucket(UINT id) -+ { -+ SWR_ASSERT(tlsThreadId < mThreads.size()); -+ BUCKET_THREAD &bt = mThreads[tlsThreadId]; -+ -+ if (bt.level == 0) return; -+ -+ if (mThreadViz) -+ { -+ SWR_ASSERT(bt.vizFile != nullptr); -+ if (mBuckets[id].enableThreadViz) -+ { -+ VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() }; -+ Serialize(bt.vizFile, data); -+ } -+ } -+ else -+ { -+ if (bt.pCurrent->start == 0) return; -+ SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected"); -+ -+ bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start); -+ bt.pCurrent->count++; -+ -+ // pop to parent -+ bt.pCurrent = bt.pCurrent->pParent; 
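// Illustrative usage sketch (not part of the original patch): typical use of
// BucketManager from a single timed thread; bucket name and report file name
// below are illustrative only:
//
//     BucketManager mgr(false);                  // threadviz disabled
//     UINT frontend = mgr.RegisterBucket({ "FrontEnd", "", false, 0 });
//     mgr.RegisterThread("worker0");             // call on the thread being timed
//     mgr.StartCapture();
//     mgr.StartBucket(frontend);
//     /* ... work being measured ... */
//     mgr.StopBucket(frontend);
//     mgr.StopCapture();
//     mgr.PrintReport("rdtsc_report.txt");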
-+        }
-+
-+        bt.level--;
-+    }
-+
-+private:
-+    void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
-+    void PrintThread(FILE* f, const BUCKET_THREAD& thread);
-+
-+    // list of active threads that have registered with this manager
-+    std::vector<BUCKET_THREAD> mThreads;
-+
-+    // list of buckets registered with this manager
-+    std::vector<BUCKET_DESC> mBuckets;
-+
-+    // is capturing currently enabled
-+    volatile bool mCapturing{ false };
-+
-+    std::mutex mThreadMutex;
-+
-+    // enable threadviz
-+    bool mThreadViz{ false };
-+};
-diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
-new file mode 100644
-index 0000000..41c6d5d
---- /dev/null
-+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
-@@ -0,0 +1,167 @@
-+/****************************************************************************
-+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+*
-+* Permission is hereby granted, free of charge, to any person obtaining a
-+* copy of this software and associated documentation files (the "Software"),
-+* to deal in the Software without restriction, including without limitation
-+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+* and/or sell copies of the Software, and to permit persons to whom the
-+* Software is furnished to do so, subject to the following conditions:
-+*
-+* The above copyright notice and this permission notice (including the next
-+* paragraph) shall be included in all copies or substantial portions of the
-+* Software.
-+*
-+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+* IN THE SOFTWARE.
-+*
-+* @file rdtsc_buckets.h
-+*
-+* @brief declaration for rdtsc buckets.
-+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+ -+struct BUCKET -+{ -+ uint32_t id{ 0 }; -+ uint64_t start{ 0 }; -+ uint64_t elapsed{ 0 }; -+ uint32_t count{ 0 }; -+ -+ BUCKET* pParent{ nullptr }; -+ std::vector children; -+}; -+ -+struct BUCKET_DESC -+{ -+ // name of bucket, used in reports -+ std::string name; -+ -+ // description of bucket, used in threadviz -+ std::string description; -+ -+ // enable for threadviz dumping -+ bool enableThreadViz; -+ -+ // threadviz color of bucket, in RGBA8_UNORM format -+ uint32_t color; -+}; -+ -+struct BUCKET_THREAD -+{ -+ // name of thread, used in reports -+ std::string name; -+ -+ // id for this thread, assigned by the thread manager -+ uint32_t id; -+ -+ // root of the bucket hierarchy for this thread -+ BUCKET root; -+ -+ // currently executing bucket somewhere in the hierarchy -+ BUCKET* pCurrent; -+ -+ // currently executing hierarchy level -+ uint32_t level{ 0 }; -+ -+ // threadviz file object -+ FILE* vizFile{ nullptr }; -+ -+ BUCKET_THREAD() {} -+ BUCKET_THREAD(const BUCKET_THREAD& that) -+ { -+ name = that.name; -+ id = that.id; -+ root = that.root; -+ pCurrent = &root; -+ vizFile = that.vizFile; -+ } -+}; -+ -+enum VIZ_TYPE -+{ -+ VIZ_START = 0, -+ VIZ_STOP = 1, -+ VIZ_DATA = 2 -+}; -+ -+struct VIZ_START_DATA -+{ -+ uint8_t type; -+ uint32_t bucketId; -+ uint64_t timestamp; -+}; -+ -+struct VIZ_STOP_DATA -+{ -+ uint8_t type; -+ uint64_t timestamp; -+}; -+ -+inline void Serialize(FILE* f, const VIZ_START_DATA& data) -+{ -+ fwrite(&data, sizeof(VIZ_START_DATA), 1, f); -+} -+ -+inline void Deserialize(FILE* f, VIZ_START_DATA& data) -+{ -+ fread(&data, sizeof(VIZ_START_DATA), 1, f); -+ assert(data.type == VIZ_START); -+} -+ -+inline void Serialize(FILE* f, const VIZ_STOP_DATA& data) -+{ -+ fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f); -+} -+ -+inline void Deserialize(FILE* f, VIZ_STOP_DATA& data) -+{ -+ fread(&data, sizeof(VIZ_STOP_DATA), 1, f); -+ assert(data.type == VIZ_STOP); -+} -+ -+inline void Serialize(FILE* f, const std::string& string) -+{ -+ assert(string.size() <= 256); -+ -+ uint8_t length = (uint8_t)string.size(); -+ fwrite(&length, sizeof(length), 1, f); -+ fwrite(string.c_str(), string.size(), 1, f); -+} -+ -+inline void Deserialize(FILE* f, std::string& string) -+{ -+ char cstr[256]; -+ uint8_t length; -+ fread(&length, sizeof(length), 1, f); -+ fread(cstr, length, 1, f); -+ cstr[length] = 0; -+ string.assign(cstr); -+} -+ -+inline void Serialize(FILE* f, const BUCKET_DESC& desc) -+{ -+ Serialize(f, desc.name); -+ Serialize(f, desc.description); -+ fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); -+ fwrite(&desc.color, sizeof(desc.color), 1, f); -+} -+ -+inline void Deserialize(FILE* f, BUCKET_DESC& desc) -+{ -+ Deserialize(f, desc.name); -+ Deserialize(f, desc.description); -+ fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); -+ fread(&desc.color, sizeof(desc.color), 1, f); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h -new file mode 100644 -index 0000000..ef7804f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h -@@ -0,0 +1,792 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#ifndef __SWR_SIMDINTRIN_H__ -+#define __SWR_SIMDINTRIN_H__ -+ -+#include "os.h" -+ -+#include -+ -+#include -+#include -+#include -+ -+#if KNOB_SIMD_WIDTH == 8 -+typedef __m256 simdscalar; -+typedef __m256i simdscalari; -+typedef uint8_t simdmask; -+#else -+#error Unsupported vector width -+#endif -+ -+// simd vector -+OSALIGNSIMD(union) simdvector -+{ -+ simdscalar v[4]; -+ struct -+ { -+ simdscalar x, y, z, w; -+ }; -+ -+ simdscalar& operator[] (const int i) { return v[i]; } -+ const simdscalar& operator[] (const int i) const { return v[i]; } -+}; -+ -+#if KNOB_SIMD_WIDTH == 8 -+#define _simd128_maskstore_ps _mm_maskstore_ps -+#define _simd_load_ps _mm256_load_ps -+#define _simd_load1_ps _mm256_broadcast_ss -+#define _simd_loadu_ps _mm256_loadu_ps -+#define _simd_setzero_ps _mm256_setzero_ps -+#define _simd_set1_ps _mm256_set1_ps -+#define _simd_blend_ps _mm256_blend_ps -+#define _simd_blendv_ps _mm256_blendv_ps -+#define _simd_store_ps _mm256_store_ps -+#define _simd_mul_ps _mm256_mul_ps -+#define _simd_add_ps _mm256_add_ps -+#define _simd_sub_ps _mm256_sub_ps -+#define _simd_rsqrt_ps _mm256_rsqrt_ps -+#define _simd_min_ps _mm256_min_ps -+#define _simd_max_ps _mm256_max_ps -+#define _simd_movemask_ps _mm256_movemask_ps -+#define _simd_cvtps_epi32 _mm256_cvtps_epi32 -+#define _simd_cvttps_epi32 _mm256_cvttps_epi32 -+#define _simd_cvtepi32_ps _mm256_cvtepi32_ps -+#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) -+#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) -+#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ) -+#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) -+#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) -+#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) -+#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm) -+#define _simd_and_ps _mm256_and_ps -+#define _simd_or_ps _mm256_or_ps -+ -+#define _simd_rcp_ps _mm256_rcp_ps -+#define _simd_div_ps _mm256_div_ps -+#define _simd_castsi_ps _mm256_castsi256_ps -+#define _simd_andnot_ps _mm256_andnot_ps -+#define _simd_round_ps _mm256_round_ps -+#define _simd_castpd_ps _mm256_castpd_ps -+#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a)) -+ -+#define _simd_load_sd _mm256_load_sd -+#define _simd_movemask_pd _mm256_movemask_pd -+#define _simd_castsi_pd 
_mm256_castsi256_pd -+ -+// emulated integer simd -+#define SIMD_EMU_EPI(func, intrin) \ -+INLINE \ -+__m256i func(__m256i a, __m256i b)\ -+{\ -+ __m128i aHi = _mm256_extractf128_si256(a, 1);\ -+ __m128i bHi = _mm256_extractf128_si256(b, 1);\ -+ __m128i aLo = _mm256_castsi256_si128(a);\ -+ __m128i bLo = _mm256_castsi256_si128(b);\ -+\ -+ __m128i subLo = intrin(aLo, bLo);\ -+ __m128i subHi = intrin(aHi, bHi);\ -+\ -+ __m256i result = _mm256_castsi128_si256(subLo);\ -+ result = _mm256_insertf128_si256(result, subHi, 1);\ -+\ -+ return result;\ -+} -+ -+#if (KNOB_ARCH == KNOB_ARCH_AVX) -+#define _simd_mul_epi32 _simdemu_mul_epi32 -+#define _simd_mullo_epi32 _simdemu_mullo_epi32 -+#define _simd_sub_epi32 _simdemu_sub_epi32 -+#define _simd_sub_epi64 _simdemu_sub_epi64 -+#define _simd_min_epi32 _simdemu_min_epi32 -+#define _simd_min_epu32 _simdemu_min_epu32 -+#define _simd_max_epi32 _simdemu_max_epi32 -+#define _simd_max_epu32 _simdemu_max_epu32 -+#define _simd_add_epi32 _simdemu_add_epi32 -+#define _simd_and_si _simdemu_and_si -+#define _simd_andnot_si _simdemu_andnot_si -+#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32 -+#define _simd_cmplt_epi32 _simdemu_cmplt_epi32 -+#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32 -+#define _simd_or_si _simdemu_or_si -+#define _simd_castps_si _mm256_castps_si256 -+#define _simd_adds_epu8 _simdemu_adds_epu8 -+#define _simd_subs_epu8 _simdemu_subs_epu8 -+#define _simd_add_epi8 _simdemu_add_epi8 -+#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 -+#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 -+ -+SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) -+SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) -+SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32) -+SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64) -+SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32) -+SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32) -+SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32) -+SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32) -+SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32) -+SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128) -+SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128) -+SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32) -+SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32) -+SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32) -+SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128) -+SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8) -+SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) -+SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) -+SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) -+SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) -+ -+#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) -+#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) -+ -+#define _simd_srli_si(a,i) _simdemu_srli_si128(a) -+#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a) -+#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a) -+#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a) -+#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128(_mm256_castps_si256(a))) -+ -+#define _simd128_fmadd_ps _mm_fmaddemu_ps -+#define _simd_fmadd_ps _mm_fmaddemu256_ps -+#define _simd_fmsub_ps _mm_fmsubemu256_ps -+#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 -+SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) -+ -+INLINE -+__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) -+{ -+ __m128 res = _mm_mul_ps(a, b); -+ res = _mm_add_ps(res, c); -+ return res; -+} -+ -+INLINE -+__m256 
_mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) -+{ -+ __m256 res = _mm256_mul_ps(a, b); -+ res = _mm256_add_ps(res, c); -+ return res; -+} -+ -+INLINE -+__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) -+{ -+ __m256 res = _mm256_mul_ps(a, b); -+ res = _mm256_sub_ps(res, c); -+ return res; -+} -+ -+INLINE -+__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) -+{ -+ uint32_t *pOffsets = (uint32_t*)&vOffsets; -+ simdscalar vResult; -+ float* pResult = (float*)&vResult; -+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) -+ { -+ uint32_t offset = pOffsets[i]; -+ offset = offset * scale; -+ pResult[i] = *(float*)(((const uint8_t*)pBase + offset)); -+ } -+ -+ return vResult; -+} -+ -+INLINE -+__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) -+{ -+ uint32_t *pOffsets = (uint32_t*)&vOffsets; -+ simdscalar vResult = vSrc; -+ float* pResult = (float*)&vResult; -+ DWORD index; -+ uint32_t mask = _simd_movemask_ps(vMask); -+ while (_BitScanForward(&index, mask)) -+ { -+ mask &= ~(1 << index); -+ uint32_t offset = pOffsets[index]; -+ offset = offset * scale; -+ pResult[index] = *(float*)(((const uint8_t*)pBase + offset)); -+ } -+ -+ return vResult; -+} -+ -+INLINE -+__m256i _simd_abs_epi32(__m256i a) -+{ -+ __m128i aHi = _mm256_extractf128_si256(a, 1); -+ __m128i aLo = _mm256_castsi256_si128(a); -+ __m128i absLo = _mm_abs_epi32(aLo); -+ __m128i absHi = _mm_abs_epi32(aHi); -+ __m256i result = _mm256_castsi128_si256(absLo); -+ result = _mm256_insertf128_si256(result, absHi, 1); -+ return result; -+} -+#else -+ -+#define _simd_mul_epi32 _mm256_mul_epi32 -+#define _simd_mullo_epi32 _mm256_mullo_epi32 -+#define _simd_sub_epi32 _mm256_sub_epi32 -+#define _simd_sub_epi64 _mm256_sub_epi64 -+#define _simd_min_epi32 _mm256_min_epi32 -+#define _simd_max_epi32 _mm256_max_epi32 -+#define _simd_min_epu32 _mm256_min_epu32 -+#define _simd_max_epu32 _mm256_max_epu32 -+#define _simd_add_epi32 _mm256_add_epi32 -+#define _simd_and_si _mm256_and_si256 -+#define _simd_andnot_si _mm256_andnot_si256 -+#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32 -+#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a) -+#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b) -+#define _simd_or_si _mm256_or_si256 -+#define _simd_castps_si _mm256_castps_si256 -+ -+#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 -+#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 -+ -+#define _simd_srli_si(a,i) _simdemu_srli_si128(a) -+#define _simd_slli_epi32 _mm256_slli_epi32 -+#define _simd_srai_epi32 _mm256_srai_epi32 -+#define _simd_srli_epi32 _mm256_srli_epi32 -+#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128(_mm256_castps_si256(a))) -+#define _simd128_fmadd_ps _mm_fmadd_ps -+#define _simd_fmadd_ps _mm256_fmadd_ps -+#define _simd_fmsub_ps _mm256_fmsub_ps -+#define _simd_shuffle_epi8 _mm256_shuffle_epi8 -+#define _simd_adds_epu8 _mm256_adds_epu8 -+#define _simd_subs_epu8 _mm256_subs_epu8 -+#define _simd_add_epi8 _mm256_add_epi8 -+#define _simd_i32gather_ps _mm256_i32gather_ps -+#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps -+#define _simd_abs_epi32 _mm256_abs_epi32 -+ -+#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 -+#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 -+#endif -+ -+#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) -+#define _simd_shuffle_ps _mm256_shuffle_ps -+#define _simd_set1_epi32 _mm256_set1_epi32 -+#define _simd_set1_epi8 
_mm256_set1_epi8
-+#define _simd_setzero_si _mm256_setzero_si256
-+#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-+#define _simd_store_si _mm256_store_si256
-+#define _simd_broadcast_ss _mm256_broadcast_ss
-+#define _simd_maskstore_ps _mm256_maskstore_ps
-+#define _simd_load_si _mm256_load_si256
-+#define _simd_loadu_si _mm256_loadu_si256
-+#define _simd_sub_ps _mm256_sub_ps
-+#define _simd_testz_ps _mm256_testz_ps
-+#define _simd_xor_ps _mm256_xor_ps
-+
-+
-+INLINE
-+simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
-+{
-+    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
-+}
-+
-+// convert bitmask to vector mask
-+INLINE
-+simdscalar vMask(int32_t mask)
-+{
-+    __m256i vec = _mm256_set1_epi32(mask);
-+    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-+    vec = _simd_and_si(vec, bit);
-+    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-+    return _simd_castsi_ps(vec);
-+}
-+
-+INLINE
-+void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
-+{
-+    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
-+    _mm256_store_ps(rArray, r);
-+    _mm256_store_ps(sArray, s);
-+    rArray[rlane] = sArray[slane];
-+    r = _mm256_load_ps(rArray);
-+}
-+
-+template<int i>
-+__m256i _simdemu_srli_si128(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_srli_si128(aHi, i);
-+    __m128i resLo = _mm_alignr_epi8(aHi, aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+template<int i>
-+__m256i _simdemu_slli_epi32(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_slli_epi32(aHi, i);
-+    __m128i resLo = _mm_slli_epi32(aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+template<int i>
-+__m256i _simdemu_srai_epi32(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_srai_epi32(aHi, i);
-+    __m128i resLo = _mm_srai_epi32(aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+template<int i>
-+__m256i _simdemu_srli_epi32(__m256i a)
-+{
-+    __m128i aHi = _mm256_extractf128_si256(a, 1);
-+    __m128i aLo = _mm256_castsi256_si128(a);
-+
-+    __m128i resHi = _mm_srli_epi32(aHi, i);
-+    __m128i resLo = _mm_srli_epi32(aLo, i);
-+
-+    __m256i result = _mm256_castsi128_si256(resLo);
-+    result = _mm256_insertf128_si256(result, resHi, 1);
-+
-+    return result;
-+}
-+
-+INLINE
-+void _simdvec_transpose(simdvector &v)
-+{
-+    SWR_ASSERT(false, "Need to implement 8 wide version");
-+}
-+
-+#else
-+#error Unsupported vector width
-+#endif
-+
-+// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
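// Illustrative example (not part of the original patch): with KNOB_SIMD_WIDTH == 8
// and p = {1, 2, 3, 4}, the function below yields
//     r[0] = {1,1,1,1,1,1,1,1}   // x broadcast to every lane
//     r[1] = {2,2,2,2,2,2,2,2}   // y
//     r[2] = {3,3,3,3,3,3,3,3}   // z
//     r[3] = {4,4,4,4,4,4,4,4}   // w
// i.e. one AoS vec4 becomes four SoA registers, one register per component.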
-+INLINE -+void _simdvec_load_ps(simdvector& r, const float *p) -+{ -+ r[0] = _simd_set1_ps(p[0]); -+ r[1] = _simd_set1_ps(p[1]); -+ r[2] = _simd_set1_ps(p[2]); -+ r[3] = _simd_set1_ps(p[3]); -+} -+ -+INLINE -+void _simdvec_mov(simdvector& r, const simdscalar& s) -+{ -+ r[0] = s; -+ r[1] = s; -+ r[2] = s; -+ r[3] = s; -+} -+ -+INLINE -+void _simdvec_mov(simdvector& r, const simdvector& v) -+{ -+ r[0] = v[0]; -+ r[1] = v[1]; -+ r[2] = v[2]; -+ r[3] = v[3]; -+} -+ -+// just move a lane from the source simdvector to dest simdvector -+INLINE -+void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) -+{ -+ _simd_mov(r[0], rlane, s[0], slane); -+ _simd_mov(r[1], rlane, s[1], slane); -+ _simd_mov(r[2], rlane, s[2], slane); -+ _simd_mov(r[3], rlane, s[3], slane); -+} -+ -+INLINE -+void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) -+{ -+ simdscalar tmp; -+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) -+ -+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) -+ -+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) -+} -+ -+INLINE -+void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) -+{ -+ simdscalar tmp; -+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) -+ -+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) -+ -+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) -+ -+ tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) -+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) -+} -+ -+INLINE -+simdscalar _simdvec_rcp_length_ps(const simdvector& v) -+{ -+ simdscalar length; -+ _simdvec_dp4_ps(length, v, v); -+ return _simd_rsqrt_ps(length); -+} -+ -+INLINE -+void _simdvec_normalize_ps(simdvector& r, const simdvector& v) -+{ -+ simdscalar vecLength; -+ vecLength = _simdvec_rcp_length_ps(v); -+ -+ r[0] = _simd_mul_ps(v[0], vecLength); -+ r[1] = _simd_mul_ps(v[1], vecLength); -+ r[2] = _simd_mul_ps(v[2], vecLength); -+ r[3] = _simd_mul_ps(v[3], vecLength); -+} -+ -+INLINE -+void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s) -+{ -+ r[0] = _simd_mul_ps(v[0], s); -+ r[1] = _simd_mul_ps(v[1], s); -+ r[2] = _simd_mul_ps(v[2], s); -+ r[3] = _simd_mul_ps(v[3], s); -+} -+ -+INLINE -+void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1) -+{ -+ r[0] = _simd_mul_ps(v0[0], v1[0]); -+ r[1] = _simd_mul_ps(v0[1], v1[1]); -+ r[2] = _simd_mul_ps(v0[2], v1[2]); -+ r[3] = _simd_mul_ps(v0[3], v1[3]); -+} -+ -+INLINE -+void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1) -+{ -+ r[0] = _simd_add_ps(v0[0], v1[0]); -+ r[1] = _simd_add_ps(v0[1], v1[1]); -+ r[2] = _simd_add_ps(v0[2], v1[2]); -+ r[3] = _simd_add_ps(v0[3], v1[3]); -+} -+ -+INLINE -+void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s) -+{ -+ r[0] = _simd_min_ps(v0[0], s); -+ r[1] = _simd_min_ps(v0[1], s); -+ r[2] = _simd_min_ps(v0[2], s); -+ r[3] = _simd_min_ps(v0[3], s); -+} -+ -+INLINE -+void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) -+{ -+ r[0] = _simd_max_ps(v0[0], s); -+ r[1] = _simd_max_ps(v0[1], s); -+ r[2] = _simd_max_ps(v0[2], s); -+ r[3] = _simd_max_ps(v0[3], s); -+} -+ -+// Matrix4x4 * Vector4 -+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w) -+// 
outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w) -+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w) -+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) -+INLINE -+void _simd_mat4x4_vec4_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[0] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[2] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] -+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) -+ result[3] = r0; -+} -+ -+// Matrix4x4 * Vector3 - Direction Vector where w = 0. 
-+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0) -+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0) -+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0) -+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) -+INLINE -+void _simd_mat3x3_vec3_w0_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ result[0] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ result[2] = r0; -+ -+ result[3] = _simd_setzero_ps(); -+} -+ -+// Matrix4x4 * Vector3 - Position vector where w = 1. 
-+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1) -+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1) -+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1) -+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) -+INLINE -+void _simd_mat4x4_vec3_w1_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[0] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[2] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] -+ result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+} -+ -+INLINE -+void _simd_mat4x3_vec3_w1_multiply( -+ simdvector& result, -+ const float *pMatrix, -+ const simdvector& v) -+{ -+ simdscalar m; -+ simdscalar r0; -+ simdscalar r1; -+ -+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[0] = r0; -+ -+ m = 
_simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[1] = r0; -+ -+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] -+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) -+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] -+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) -+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] -+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) -+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) -+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] -+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) -+ result[2] = r0; -+ result[3] = _simd_set1_ps(1.0f); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Compute plane equation vA * vX + vB * vY + vC -+INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY) -+{ -+ simdscalar vOut = _simd_fmadd_ps(vA, vX, vC); -+ vOut = _simd_fmadd_ps(vB, vY, vOut); -+ return vOut; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Interpolates a single component. -+/// @param vI - barycentric I -+/// @param vJ - barycentric J -+/// @param pInterpBuffer - pointer to attribute barycentric coeffs -+template -+static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer) -+{ -+ const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp]; -+ const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp]; -+ const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp]; -+ -+ simdscalar vA = _simd_broadcast_ss(pInterpA); -+ simdscalar vB = _simd_broadcast_ss(pInterpB); -+ simdscalar vC = _simd_broadcast_ss(pInterpC); -+ -+ simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ); -+ vC = _simd_mul_ps(vk, vC); -+ -+ return vplaneps(vA, vB, vC, vI, vJ); -+} -+ -+ -+#endif//__SWR_SIMDINTRIN_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp -new file mode 100644 -index 0000000..8f176e1 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#include "common/os.h" -+#include -+#include -+#include -+ -+#if defined(SWR_ENABLE_ASSERTS) -+ -+#if defined(_WIN32) -+#pragma comment(lib, "user32.lib") -+#endif // _WIN32 -+ -+bool SwrAssert( -+ bool& enabled, -+ const char* pExpression, -+ const char* pFileName, -+ uint32_t lineNum, -+ const char* pFmtString /* = nullptr */, -+ ...) -+{ -+ if (!enabled) return false; -+ -+#if defined(_WIN32) -+ static const int MAX_MESSAGE_LEN = 2048; -+ char msgBuf[MAX_MESSAGE_LEN]; -+ -+ sprintf_s(msgBuf, "%s(%d): assert: %s\n", pFileName, lineNum, pExpression); -+ msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; -+ msgBuf[MAX_MESSAGE_LEN - 1] = 0; -+ OutputDebugStringA(msgBuf); -+ -+ int offset = 0; -+ -+ if (pFmtString) -+ { -+ va_list args; -+ va_start(args, pFmtString); -+ offset = _vsnprintf_s( -+ msgBuf, -+ sizeof(msgBuf), -+ sizeof(msgBuf), -+ pFmtString, -+ args); -+ va_end(args); -+ -+ if (offset < 0) { return true; } -+ -+ OutputDebugStringA("\t"); -+ OutputDebugStringA(msgBuf); -+ OutputDebugStringA("\n"); -+ } -+ -+ if (KNOB_ENABLE_ASSERT_DIALOGS) -+ { -+ int retval = sprintf_s( -+ &msgBuf[offset], -+ MAX_MESSAGE_LEN - offset, -+ "\n\n" -+ "File: %s\n" -+ "Line: %d\n" -+ "\n" -+ "Expression: %s\n\n" -+ "Cancel: Disable this assert for the remainder of the process\n" -+ "Try Again: Break into the debugger\n" -+ "Continue: Continue execution (but leave assert enabled)", -+ pFileName, -+ lineNum, -+ pExpression); -+ -+ if (retval < 0) { return true; } -+ -+ offset += retval; -+ -+ if (!IsDebuggerPresent()) -+ { -+ sprintf_s( -+ &msgBuf[offset], -+ MAX_MESSAGE_LEN - offset, -+ "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!"); -+ } -+ -+ retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION); -+ -+ switch (retval) -+ { -+ case IDCANCEL: -+ enabled = false; -+ return false; -+ -+ case IDTRYAGAIN: -+ return true; -+ -+ case IDCONTINUE: -+ return false; -+ } -+ } -+ else -+ { -+ return 0 != IsDebuggerPresent(); -+ } -+ -+#else // !_WIN32 -+ fprintf(stderr, "%s(%d): assert: %s\n", pFileName, lineNum, pExpression); -+ if (pFmtString) -+ { -+ va_list args; -+ va_start(args, pFmtString); -+ vfprintf(stderr, pFmtString, args); -+ va_end(args); -+ } -+ fflush(stderr); -+ -+ /// @todo - Implement message box on non-Windows platforms -+ -+#endif -+ return true; -+} -+ -+#endif // SWR_ENABLE_ASSERTS -diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h -new file mode 100644 -index 0000000..afc9f59 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h -@@ -0,0 +1,84 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#ifndef __SWR_ASSERT_H__ -+#define __SWR_ASSERT_H__ -+ -+#if !defined(__SWR_OS_H__) -+#error swr_assert.h should not be included directly, please include "common/os.h" instead. -+#endif -+ -+#if !defined(SWR_ENABLE_ASSERTS) -+ -+#if !defined(NDEBUG) -+#define SWR_ENABLE_ASSERTS 1 -+#else -+#define SWR_ENABLE_ASSERTS 0 -+#endif // _DEBUG -+ -+#endif // SWR_ENABLE_ASSERTS -+ -+#if SWR_ENABLE_ASSERTS -+#include "assert.h" -+ -+#if !defined(__cplusplus) -+ -+#pragma message("C++ is required for SWR Asserts, falling back to assert.h") -+ -+#define SWR_ASSERT(e, ...) assert(e) -+ -+#else -+ -+#if defined(assert) -+#undef assert -+#endif -+#define assert(exp) SWR_ASSERT(exp) -+ -+bool SwrAssert( -+ bool& enabled, -+ const char* pExpression, -+ const char* pFileName, -+ uint32_t lineNum, -+ const char* pFmtString = nullptr, -+ ...); -+ -+#define SWR_ASSERT(e, ...) {\ -+ bool expFailed = !(e);\ -+ if (expFailed) {\ -+ static bool swrAssertEnabled = true;\ -+ expFailed = SwrAssert(swrAssertEnabled, #e, __FILE__, __LINE__, ##__VA_ARGS__);\ -+ if (expFailed) { DEBUGBREAK; }\ -+ }\ -+} -+ -+#endif // C++ -+ -+#else // No asserts enabled -+ -+#define SWR_ASSERT(e, ...) {} -+ -+#endif -+ -+#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__) -+ -+#endif//__SWR_OS_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp -new file mode 100644 -index 0000000..1081e28 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp -@@ -0,0 +1,1461 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
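// [Editor's note] Hedged usage sketch for the SWR_ASSERT macro defined in
// swr_assert.h above (assumes "common/os.h" is included so the macro is visible).
// On failure SwrAssert() formats and reports the message; a 'true' return
// triggers DEBUGBREAK. Illustration only, not part of the original patch.
static void BindSoBufferChecked(uint32_t slot)
{
    // Plain condition, no message.
    SWR_ASSERT(slot < 4);

    // Condition plus printf-style diagnostic, mirroring the check in
    // SwrSetSoBuffers() in api.cpp below.
    SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
}

// Unimplemented paths can use the convenience macro:
static void SomeUnfinishedPath() { SWR_NOT_IMPL; }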
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file api.cpp -+* -+* @brief API implementation -+* -+******************************************************************************/ -+ -+#include -+#include -+#include -+ -+#if defined(__gnu_linux__) || defined(__linux__) -+#include -+#endif -+ -+#include "core/api.h" -+#include "core/backend.h" -+#include "core/context.h" -+#include "core/frontend.h" -+#include "core/rasterizer.h" -+#include "core/rdtsc_core.h" -+#include "core/threads.h" -+#include "core/tilemgr.h" -+#include "core/clip.h" -+ -+#include "common/simdintrin.h" -+#include "common/os.h" -+ -+void SetupDefaultState(SWR_CONTEXT *pContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create SWR Context. -+/// @param pCreateInfo - pointer to creation info. -+HANDLE SwrCreateContext( -+ const SWR_CREATECONTEXT_INFO* pCreateInfo) -+{ -+ RDTSC_RESET(); -+ RDTSC_INIT(0); -+ -+ void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); -+ memset(pContextMem, 0, sizeof(SWR_CONTEXT)); -+ SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); -+ -+ pContext->driverType = pCreateInfo->driver; -+ pContext->privateStateSize = pCreateInfo->privateStateSize; -+ -+ pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); -+ memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); -+ -+ pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); -+ memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); -+ -+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) -+ { -+ pContext->dcRing[dc].arena.Init(); -+ pContext->dcRing[dc].inUse = false; -+ pContext->dcRing[dc].pTileMgr = new MacroTileMgr(); -+ pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. -+ -+ pContext->dsRing[dc].arena.Init(); -+ } -+ -+ if (!KNOB_SINGLE_THREADED) -+ { -+ memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); -+ memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); -+ new (&pContext->WaitLock) std::mutex(); -+ new (&pContext->FifosNotEmpty) std::condition_variable(); -+ -+ CreateThreadPool(pContext, &pContext->threadPool); -+ } -+ -+ // Calling createThreadPool() above can set SINGLE_THREADED -+ if (KNOB_SINGLE_THREADED) -+ { -+ pContext->NumWorkerThreads = 1; -+ } -+ -+ // Allocate scratch space for workers. -+ ///@note We could lazily allocate this but its rather small amount of memory. -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ ///@todo Use numa API for allocations using numa information from thread data (if exists). 
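// [Editor's note] Self-contained sketch of the ring indexing behind the
// dcRing/dsRing allocations above: draw ids grow monotonically and map onto a
// fixed ring of KNOB_MAX_DRAWS_IN_FLIGHT slots, each reused only after the draw
// occupying it retires (see UpdateLastRetiredId/GetDrawContext below). The ring
// size here is a stand-in value; illustration only, not part of the original patch.
#include <cstdint>
#include <cstdio>

static const uint64_t MAX_DRAWS_IN_FLIGHT = 96;   // stand-in for KNOB_MAX_DRAWS_IN_FLIGHT

int main()
{
    uint64_t tail = 200;   // DrawEnqueued
    uint64_t head = 100;   // LastRetiredId + 1, which may lag far behind

    // UpdateLastRetiredId() clamps head so it never scans ring entries that
    // newer draws have already overwritten.
    if ((tail - head) > MAX_DRAWS_IN_FLIGHT - 1)
    {
        head = tail - MAX_DRAWS_IN_FLIGHT + 1;     // 105 for these values
    }

    // Each draw id owns slot (drawId % ring size).
    printf("head=%llu slot=%llu\n",
           (unsigned long long)head,
           (unsigned long long)(head % MAX_DRAWS_IN_FLIGHT));
    return 0;
}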
-+ pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); -+ } -+ -+ pContext->LastRetiredId = 0; -+ pContext->nextDrawId = 1; -+ -+ // workers start at draw 1 -+ for (uint32_t i = 0; i < KNOB_MAX_NUM_THREADS; ++i) -+ { -+ pContext->WorkerFE[i] = 1; -+ pContext->WorkerBE[i] = 1; -+ } -+ -+ pContext->DrawEnqueued = 1; -+ -+ // State setup AFTER context is fully initialized -+ SetupDefaultState(pContext); -+ -+ // initialize hot tile manager -+ pContext->pHotTileMgr = new HotTileMgr(); -+ -+ // initialize function pointer tables -+ InitClearTilesTable(); -+ -+ // initialize store tiles function -+ pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; -+ pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; -+ pContext->pfnClearTile = pCreateInfo->pfnClearTile; -+ -+ return (HANDLE)pContext; -+} -+ -+void SwrDestroyContext(HANDLE hContext) -+{ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DestroyThreadPool(pContext, &pContext->threadPool); -+ -+ // free the fifos -+ for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) -+ { -+ delete(pContext->dcRing[i].pTileMgr); -+ delete(pContext->dcRing[i].pDispatch); -+ } -+ -+ // Free scratch space. -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ _aligned_free(pContext->pScratch[i]); -+ } -+ -+ _aligned_free(pContext->dcRing); -+ _aligned_free(pContext->dsRing); -+ -+ delete(pContext->pHotTileMgr); -+ -+ pContext->~SWR_CONTEXT(); -+ _aligned_free((SWR_CONTEXT*)hContext); -+} -+ -+void WakeAllThreads(SWR_CONTEXT *pContext) -+{ -+ std::unique_lock lock(pContext->WaitLock); -+ pContext->FifosNotEmpty.notify_all(); -+ lock.unlock(); -+} -+ -+bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) -+{ -+ // For single thread nothing should still be drawing. -+ if (KNOB_SINGLE_THREADED) { return false; } -+ -+ if (pDC->isCompute) -+ { -+ if (pDC->doneCompute) -+ { -+ pDC->inUse = false; -+ return false; -+ } -+ } -+ -+ // Check if backend work is done. First make sure all triangles have been binned. -+ if (pDC->doneFE == true) -+ { -+ // ensure workers have all moved passed this draw -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ if (pContext->WorkerFE[i] <= pDC->drawId) -+ { -+ return true; -+ } -+ -+ if (pContext->WorkerBE[i] <= pDC->drawId) -+ { -+ return true; -+ } -+ } -+ -+ pDC->inUse = false; // all work is done. -+ } -+ -+ return pDC->inUse; -+} -+ -+void UpdateLastRetiredId(SWR_CONTEXT *pContext) -+{ -+ uint64_t head = pContext->LastRetiredId + 1; -+ uint64_t tail = pContext->DrawEnqueued; -+ -+ // There's no guarantee the DRAW_CONTEXT associated with (LastRetiredId+1) is still valid. -+ // This is because the update to LastRetiredId can fall behind causing the range from LastRetiredId -+ // to DrawEnqueued to exceed the size of the DRAW_CONTEXT ring. 
Check for this and manually increment -+ // the head to the oldest entry of the DRAW_CONTEXT ring -+ if ((tail - head) > KNOB_MAX_DRAWS_IN_FLIGHT - 1) -+ { -+ head = tail - KNOB_MAX_DRAWS_IN_FLIGHT + 1; -+ } -+ -+ DRAW_CONTEXT *pDC = &pContext->dcRing[head % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ while ((head < tail) && !StillDrawing(pContext, pDC)) -+ { -+ pContext->LastRetiredId = pDC->drawId; -+ head++; -+ pDC = &pContext->dcRing[head % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ } -+} -+ -+void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId) -+{ -+ if (!KNOB_SINGLE_THREADED) -+ { -+ while (drawId > pContext->LastRetiredId) -+ { -+ WakeAllThreads(pContext); -+ UpdateLastRetiredId(pContext); -+ } -+ } -+} -+ -+void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) -+{ -+ memcpy(&dst.state, &src.state, sizeof(API_STATE)); -+} -+ -+void QueueDraw(SWR_CONTEXT *pContext) -+{ -+ _ReadWriteBarrier(); -+ pContext->DrawEnqueued ++; -+ -+ if (KNOB_SINGLE_THREADED) -+ { -+ // flush denormals to 0 -+ uint32_t mxcsr = _mm_getcsr(); -+ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); -+ -+ std::unordered_set lockedTiles; -+ WorkOnFifoFE(pContext, 0, pContext->WorkerFE[0], 0); -+ WorkOnFifoBE(pContext, 0, pContext->WorkerBE[0], lockedTiles); -+ -+ // restore csr -+ _mm_setcsr(mxcsr); -+ } -+ else -+ { -+ RDTSC_START(APIDrawWakeAllThreads); -+ WakeAllThreads(pContext); -+ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); -+ } -+ -+ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. -+ pContext->pPrevDrawContext = pContext->pCurDrawContext; -+ pContext->pCurDrawContext = nullptr; -+} -+ -+///@todo Combine this with QueueDraw -+void QueueDispatch(SWR_CONTEXT *pContext) -+{ -+ _ReadWriteBarrier(); -+ pContext->DrawEnqueued++; -+ -+ if (KNOB_SINGLE_THREADED) -+ { -+ // flush denormals to 0 -+ uint32_t mxcsr = _mm_getcsr(); -+ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); -+ -+ WorkOnCompute(pContext, 0, pContext->WorkerBE[0]); -+ -+ // restore csr -+ _mm_setcsr(mxcsr); -+ } -+ else -+ { -+ RDTSC_START(APIDrawWakeAllThreads); -+ WakeAllThreads(pContext); -+ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); -+ } -+ -+ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. -+ pContext->pPrevDrawContext = pContext->pCurDrawContext; -+ pContext->pCurDrawContext = nullptr; -+} -+ -+DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) -+{ -+ RDTSC_START(APIGetDrawContext); -+ // If current draw context is null then need to obtain a new draw context to use from ring. -+ if (pContext->pCurDrawContext == nullptr) -+ { -+ uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; -+ -+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; -+ pContext->pCurDrawContext = pCurDrawContext; -+ -+ // Update LastRetiredId -+ UpdateLastRetiredId(pContext); -+ -+ // Need to wait until this draw context is available to use. -+ while (StillDrawing(pContext, pCurDrawContext)) -+ { -+ // Make sure workers are working. -+ WakeAllThreads(pContext); -+ -+ _mm_pause(); -+ } -+ -+ // Assign next available entry in DS ring to this DC. -+ uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; -+ pCurDrawContext->pState = &pContext->dsRing[dsIndex]; -+ -+ Arena& stateArena = pCurDrawContext->pState->arena; -+ -+ // Copy previous state to current state. 
-+ if (pContext->pPrevDrawContext) -+ { -+ DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; -+ -+ // If we're splitting our draw then we can just use the same state from the previous -+ // draw. In this case, we won't increment the DS ring index so the next non-split -+ // draw can receive the state. -+ if (isSplitDraw == false) -+ { -+ CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); -+ -+ stateArena.Reset(); // Reset memory. -+ -+ // Copy private state to new context. -+ if (pPrevDrawContext->pState->pPrivateState != nullptr) -+ { -+ pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); -+ memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize); -+ } -+ -+ pContext->curStateId++; // Progress state ring index forward. -+ } -+ else -+ { -+ // If its a split draw then just copy the state pointer over -+ // since its the same draw. -+ pCurDrawContext->pState = pPrevDrawContext->pState; -+ } -+ } -+ else -+ { -+ stateArena.Reset(); // Reset memory. -+ pContext->curStateId++; // Progress state ring index forward. -+ } -+ -+ pCurDrawContext->dependency = 0; -+ pCurDrawContext->arena.Reset(); -+ pCurDrawContext->pContext = pContext; -+ pCurDrawContext->isCompute = false; // Dispatch has to set this to true. -+ pCurDrawContext->inUse = false; -+ -+ pCurDrawContext->doneCompute = false; -+ pCurDrawContext->doneFE = false; -+ pCurDrawContext->FeLock = 0; -+ -+ pCurDrawContext->pTileMgr->initialize(); -+ -+ // Assign unique drawId for this DC -+ pCurDrawContext->drawId = pContext->nextDrawId++; -+ } -+ else -+ { -+ SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); -+ } -+ -+ RDTSC_STOP(APIGetDrawContext, 0, 0); -+ return pContext->pCurDrawContext; -+} -+ -+API_STATE* GetDrawState(SWR_CONTEXT *pContext) -+{ -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ SWR_ASSERT(pDC->pState != nullptr); -+ -+ return &pDC->pState->state; -+} -+ -+void SetupDefaultState(SWR_CONTEXT *pContext) -+{ -+ API_STATE* pState = GetDrawState(pContext); -+ -+ pState->rastState.cullMode = SWR_CULLMODE_NONE; -+ pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; -+} -+ -+static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) -+{ -+ return (SWR_CONTEXT*)hContext; -+} -+ -+void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2) -+{ -+ RDTSC_START(APISync); -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->inUse = true; -+ -+ pDC->FeWork.type = SYNC; -+ pDC->FeWork.pfnWork = ProcessSync; -+ pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc; -+ pDC->FeWork.desc.sync.userData = userData; -+ pDC->FeWork.desc.sync.userData2 = userData2; -+ -+ // cannot execute until all previous draws have completed -+ pDC->dependency = pDC->drawId - 1; -+ -+ //enqueue -+ QueueDraw(pContext); -+ -+ RDTSC_STOP(APISync, 1, 0); -+} -+ -+void SwrWaitForIdle(HANDLE hContext) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ -+ // Wait on the previous DrawContext's drawId, as this function doesn't queue anything. 
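// [Editor's note] Hedged sketch of using SwrSync() above as a fence: the queued
// SYNC work item depends on drawId - 1, so the callback fires only after all
// previously queued draws have retired. HANDLE and PFN_CALLBACK_FUNC come from
// api.h below; calling-convention details are omitted. Illustration only, not
// part of the original patch.
#include <atomic>
#include <cstdint>

struct MyFence
{
    std::atomic<uint64_t> completedValue{0};
};

// Shape matches PFN_CALLBACK_FUNC: void (uint64_t data, uint64_t data2).
static void FenceCallback(uint64_t userData, uint64_t userData2)
{
    MyFence* pFence = reinterpret_cast<MyFence*>(userData);
    pFence->completedValue.store(userData2, std::memory_order_release);
}

static void SubmitFence(HANDLE hContext, MyFence& fence, uint64_t value)
{
    // FenceCallback(&fence, value) runs once all prior rendering completes.
    SwrSync(hContext, FenceCallback, reinterpret_cast<uint64_t>(&fence), value);
}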
-+ if (pContext->pPrevDrawContext) -+ WaitForDependencies(pContext, pContext->pPrevDrawContext->drawId); -+} -+ -+void SwrSetVertexBuffers( -+ HANDLE hContext, -+ uint32_t numBuffers, -+ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ for (uint32_t i = 0; i < numBuffers; ++i) -+ { -+ const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; -+ pState->vertexBuffers[pVB->index] = *pVB; -+ } -+} -+ -+void SwrSetIndexBuffer( -+ HANDLE hContext, -+ const SWR_INDEX_BUFFER_STATE* pIndexBuffer) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->indexBuffer = *pIndexBuffer; -+} -+ -+void SwrSetFetchFunc( -+ HANDLE hContext, -+ PFN_FETCH_FUNC pfnFetchFunc) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->pfnFetchFunc = pfnFetchFunc; -+} -+ -+void SwrSetSoFunc( -+ HANDLE hContext, -+ PFN_SO_FUNC pfnSoFunc, -+ uint32_t streamIndex) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ SWR_ASSERT(streamIndex < MAX_SO_STREAMS); -+ -+ pState->pfnSoFunc[streamIndex] = pfnSoFunc; -+} -+ -+void SwrSetSoState( -+ HANDLE hContext, -+ SWR_STREAMOUT_STATE* pSoState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->soState = *pSoState; -+} -+ -+void SwrSetSoBuffers( -+ HANDLE hContext, -+ SWR_STREAMOUT_BUFFER* pSoBuffer, -+ uint32_t slot) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); -+ -+ pState->soBuffer[slot] = *pSoBuffer; -+} -+ -+void SwrSetVertexFunc( -+ HANDLE hContext, -+ PFN_VERTEX_FUNC pfnVertexFunc) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->pfnVertexFunc = pfnVertexFunc; -+} -+ -+void SwrSetFrontendState( -+ HANDLE hContext, -+ SWR_FRONTEND_STATE *pFEState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->frontendState = *pFEState; -+} -+ -+void SwrSetGsState( -+ HANDLE hContext, -+ SWR_GS_STATE *pGSState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->gsState = *pGSState; -+} -+ -+void SwrSetGsFunc( -+ HANDLE hContext, -+ PFN_GS_FUNC pfnGsFunc) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->pfnGsFunc = pfnGsFunc; -+} -+ -+void SwrSetCsFunc( -+ HANDLE hContext, -+ PFN_CS_FUNC pfnCsFunc, -+ uint32_t totalThreadsInGroup) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ pState->pfnCsFunc = pfnCsFunc; -+ pState->totalThreadsInGroup = totalThreadsInGroup; -+} -+ -+void SwrSetTsState( -+ HANDLE hContext, -+ SWR_TS_STATE *pState) -+{ -+ API_STATE* pApiState = GetDrawState(GetContext(hContext)); -+ pApiState->tsState = *pState; -+} -+ -+void SwrSetHsFunc( -+ HANDLE hContext, -+ PFN_HS_FUNC pfnFunc) -+{ -+ API_STATE* pApiState = GetDrawState(GetContext(hContext)); -+ pApiState->pfnHsFunc = pfnFunc; -+} -+ -+void SwrSetDsFunc( -+ HANDLE hContext, -+ PFN_DS_FUNC pfnFunc) -+{ -+ API_STATE* pApiState = GetDrawState(GetContext(hContext)); -+ pApiState->pfnDsFunc = pfnFunc; -+} -+ -+void SwrSetDepthStencilState( -+ HANDLE hContext, -+ SWR_DEPTH_STENCIL_STATE *pDSState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->depthStencilState = *pDSState; -+} -+ -+void SwrSetBackendState( -+ HANDLE hContext, -+ SWR_BACKEND_STATE *pBEState) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ pState->backendState = *pBEState; -+} -+ -+void SwrSetPixelShaderState( -+ HANDLE 
hContext, -+ SWR_PS_STATE *pPSState) -+{ -+ API_STATE *pState = GetDrawState(GetContext(hContext)); -+ pState->psState = *pPSState; -+} -+ -+void SwrSetBlendState( -+ HANDLE hContext, -+ SWR_BLEND_STATE *pBlendState) -+{ -+ API_STATE *pState = GetDrawState(GetContext(hContext)); -+ memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); -+} -+ -+void SwrSetBlendFunc( -+ HANDLE hContext, -+ uint32_t renderTarget, -+ PFN_BLEND_JIT_FUNC pfnBlendFunc) -+{ -+ SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); -+ API_STATE *pState = GetDrawState(GetContext(hContext)); -+ pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; -+} -+ -+void SwrSetLinkage( -+ HANDLE hContext, -+ uint32_t mask, -+ const uint8_t* pMap) -+{ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ -+ static const uint8_t IDENTITY_MAP[] = -+ { -+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -+ }; -+ static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap), -+ "Update for new value of MAX_ATTRIBUTES"); -+ -+ pState->linkageMask = mask; -+ pState->linkageCount = _mm_popcnt_u32(mask); -+ -+ if (!pMap) -+ { -+ pMap = IDENTITY_MAP; -+ } -+ memcpy(pState->linkageMap, pMap, pState->linkageCount); -+} -+ -+// update guardband multipliers for the viewport -+void updateGuardband(API_STATE *pState) -+{ -+ // guardband center is viewport center -+ pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; -+ pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; -+ pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; -+ pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; -+} -+ -+void SwrSetRastState( -+ HANDLE hContext, -+ const SWR_RASTSTATE *pRastState) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ API_STATE* pState = GetDrawState(pContext); -+ -+ memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); -+} -+ -+void SwrSetViewports( -+ HANDLE hContext, -+ uint32_t numViewports, -+ const SWR_VIEWPORT* pViewports, -+ const SWR_VIEWPORT_MATRIX* pMatrices) -+{ -+ SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, -+ "Invalid number of viewports."); -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ API_STATE* pState = GetDrawState(pContext); -+ -+ memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); -+ -+ if (pMatrices != nullptr) -+ { -+ memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); -+ } -+ else -+ { -+ // Compute default viewport transform. -+ for (uint32_t i = 0; i < numViewports; ++i) -+ { -+ if (pContext->driverType == DX) -+ { -+ pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; -+ pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; -+ pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; -+ pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; -+ pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; -+ pState->vpMatrix[i].m32 = pState->vp[i].minZ; -+ } -+ else -+ { -+ // Standard, with the exception that Y is inverted. 
-+ pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; -+ pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; -+ pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; -+ pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; -+ pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; -+ pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; -+ -+ // Now that the matrix is calculated, clip the view coords to screen size. -+ // OpenGL allows for -ve x,y in the viewport. -+ pState->vp[i].x = std::max(pState->vp[i].x, 0.0f); -+ pState->vp[i].y = std::max(pState->vp[i].y, 0.0f); -+ } -+ } -+ } -+ -+ updateGuardband(pState); -+} -+ -+void SwrSetScissorRects( -+ HANDLE hContext, -+ uint32_t numScissors, -+ const BBOX* pScissors) -+{ -+ SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, -+ "Invalid number of scissor rects."); -+ -+ API_STATE* pState = GetDrawState(GetContext(hContext)); -+ memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX)); -+}; -+ -+void SetupMacroTileScissors(DRAW_CONTEXT *pDC) -+{ -+ API_STATE *pState = &pDC->pState->state; -+ uint32_t left, right, top, bottom; -+ -+ // Set up scissor dimensions based on scissor or viewport -+ if (pState->rastState.scissorEnable) -+ { -+ // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges -+ left = pState->scissorRects[0].left; -+ right = pState->scissorRects[0].right; -+ top = pState->scissorRects[0].top; -+ bottom = pState->scissorRects[0].bottom; -+ } -+ else -+ { -+ left = (int32_t)pState->vp[0].x; -+ right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width; -+ top = (int32_t)pState->vp[0].y; -+ bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height; -+ } -+ -+ pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE; -+ pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1; -+ pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE; -+ pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; -+} -+ -+void SetupPipeline(DRAW_CONTEXT *pDC) -+{ -+ DRAW_STATE* pState = pDC->pState; -+ -+ // setup backend -+ if (pState->state.psState.pfnPixelShader == nullptr) -+ { -+ pState->pfnBackend = &BackendNullPS; -+ } -+ else -+ { -+ bool bMultisampleEnable = (pState->state.rastState.sampleCount > SWR_MULTISAMPLE_1X) ? 
1 : 0; -+ -+ // select backend function based on max slot used by PS -+ switch(pState->state.psState.shadingRate) -+ { -+ case SWR_SHADING_RATE_PIXEL: -+ if(bMultisampleEnable) -+ { -+ pState->pfnBackend = gPixelRateBackendTable[pState->state.rastState.sampleCount-1][pState->state.psState.maxRTSlotUsed]; -+ } -+ else -+ { -+ pState->pfnBackend = gSingleSampleBackendTable[pState->state.psState.maxRTSlotUsed]; -+ } -+ break; -+ case SWR_SHADING_RATE_SAMPLE: -+ ///@todo Do we need to obey sample rate -+ if (!bMultisampleEnable) -+ { -+ // If PS is set at per sample rate and multisampling is disabled, set to per pixel and single sample backend -+ pState->state.psState.shadingRate = SWR_SHADING_RATE_PIXEL; -+ pState->pfnBackend = gSingleSampleBackendTable[pState->state.psState.maxRTSlotUsed]; -+ } -+ else -+ { -+ pState->pfnBackend = gSampleRateBackendTable[pState->state.rastState.sampleCount-1][pState->state.psState.maxRTSlotUsed]; -+ } -+ break; -+ case SWR_SHADING_RATE_COARSE: -+ default: -+ assert(0 && "Invalid shading rate"); -+ break; -+ } -+ } -+ -+ PFN_PROCESS_PRIMS pfnBinner; -+ switch (pState->state.topology) -+ { -+ case TOP_POINT_LIST: -+ pState->pfnProcessPrims = CanUseSimplePoints(pDC) ? ClipPoints : ClipTriangles; -+ pfnBinner = CanUseSimplePoints(pDC) ? BinPoints : BinTriangles; -+ break; -+ case TOP_LINE_LIST: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LOOP: -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LISTSTRIP_ADJ: -+ pState->pfnProcessPrims = ClipLines; -+ pfnBinner = BinLines; -+ break; -+ default: -+ pState->pfnProcessPrims = ClipTriangles; -+ pfnBinner = BinTriangles; -+ break; -+ }; -+ -+ // disable clipper if viewport transform is disabled -+ if (pState->state.frontendState.vpTransformDisable) -+ { -+ pState->pfnProcessPrims = pfnBinner; -+ } -+ -+ if ((pState->state.psState.pfnPixelShader == nullptr) && -+ (pState->state.depthStencilState.depthTestEnable == FALSE) && -+ (pState->state.depthStencilState.depthWriteEnable == FALSE) && -+ (pState->state.linkageCount == 0)) -+ { -+ pState->pfnProcessPrims = nullptr; -+ pState->state.linkageMask = 0; -+ } -+ -+ if (pState->state.soState.rasterizerDisable == true) -+ { -+ pState->pfnProcessPrims = nullptr; -+ pState->state.linkageMask = 0; -+ } -+ -+ // set up the frontend attrib mask -+ pState->state.feAttribMask = pState->state.linkageMask; -+ if (pState->state.soState.soEnable) -+ { -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ pState->state.feAttribMask |= pState->state.soState.streamMasks[i]; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief InitDraw -+/// @param pDC - Draw context to initialize for this draw. -+void InitDraw( -+ DRAW_CONTEXT *pDC, -+ bool isSplitDraw) -+{ -+ // We don't need to re-setup the scissors/pipeline state again for split draw. -+ if (isSplitDraw == false) -+ { -+ SetupMacroTileScissors(pDC); -+ SetupPipeline(pDC); -+ } -+ -+ pDC->inUse = true; // We are using this one now. 
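// [Editor's note] Worked example of the scissor conversion in
// SetupMacroTileScissors() above: right/bottom edges are exclusive at the API
// level but inclusive in the core's fixed-point rect, hence the "- 1".
// FIXED_POINT_SCALE is assumed to be 256 (8 fractional bits) purely for this
// sketch; the real value is a rasterizer knob defined elsewhere. Illustration
// only, not part of the original patch.
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t FIXED_POINT_SCALE = 256;   // assumption for this sketch
    const uint32_t left = 0, right = 1920, top = 0, bottom = 1080;

    const int32_t fxLeft   = left   * FIXED_POINT_SCALE;       // 0
    const int32_t fxRight  = right  * FIXED_POINT_SCALE - 1;   // 491519
    const int32_t fxTop    = top    * FIXED_POINT_SCALE;       // 0
    const int32_t fxBottom = bottom * FIXED_POINT_SCALE - 1;   // 276479

    printf("scissor fx: [%d, %d] x [%d, %d]\n", fxLeft, fxRight, fxTop, fxBottom);
    return 0;
}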
-+ -+ /// @todo: remove when we send down preset sample patterns (standard or center) -+ // If multisampling is enabled, precompute float sample offsets from fixed -+ uint32_t numSamples = pDC->pState->state.rastState.sampleCount; -+ if(numSamples > SWR_MULTISAMPLE_1X) -+ { -+ static const float fixed8Scale = 1.0f/FIXED_POINT_SCALE; -+ float* pSamplePos = pDC->pState->state.samplePos; -+ SWR_MULTISAMPLE_POS(&iSamplePos)[SWR_MAX_NUM_MULTISAMPLES] = pDC->pState->state.rastState.iSamplePos; -+ -+ for(uint32_t i = 0; i < numSamples; i++) -+ { -+ *(pSamplePos++) = ((float)(iSamplePos[i].x) * fixed8Scale); -+ *(pSamplePos++) = ((float)(iSamplePos[i].y) * fixed8Scale); -+ } -+ } -+ // just test the masked off samples once per draw and use the results in the backend. -+ SWR_RASTSTATE &rastState = pDC->pState->state.rastState; -+ uint32_t sampleMask = rastState.sampleMask; -+ for(uint32_t i = 0; i < SWR_MAX_NUM_MULTISAMPLES; i++) -+ { -+ rastState.isSampleMasked[i] = !(sampleMask & 1); -+ sampleMask>>=1; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief We can split the draw for certain topologies for better performance. -+/// @param totalVerts - Total vertices for draw -+/// @param topology - Topology used for draw -+uint32_t MaxVertsPerDraw( -+ DRAW_CONTEXT* pDC, -+ uint32_t totalVerts, -+ PRIMITIVE_TOPOLOGY topology) -+{ -+ API_STATE& state = pDC->pState->state; -+ -+ uint32_t vertsPerDraw = totalVerts; -+ -+ if (state.soState.soEnable) -+ { -+ return totalVerts; -+ } -+ -+ switch (topology) -+ { -+ case TOP_POINT_LIST: -+ case TOP_TRIANGLE_LIST: -+ vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; -+ break; -+ -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ if (pDC->pState->state.tsState.tsEnable) -+ { -+ uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; -+ vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; -+ } -+ break; -+ -+ default: -+ // We are not splitting up draws for other topologies. -+ break; -+ } -+ -+ return vertsPerDraw; -+} -+ -+// Recursive template used to auto-nest conditionals. Converts dynamic boolean function -+// arguments to static template arguments. -+template -+struct FEDrawChooser -+{ -+ // Last Arg Terminator -+ static PFN_FE_WORK_FUNC GetFunc(bool bArg) -+ { -+ if (bArg) -+ { -+ return ProcessDraw; -+ } -+ -+ return ProcessDraw; -+ } -+ -+ // Recursively parse args -+ template -+ static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... 
remainingArgs) -+ { -+ if (bArg) -+ { -+ return FEDrawChooser::GetFunc(remainingArgs...); -+ } -+ -+ return FEDrawChooser::GetFunc(remainingArgs...); -+ } -+}; -+ -+// Selector for correct templated Draw front-end function -+INLINE -+static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled) -+{ -+ return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled); -+} -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numVerts - How many vertices to read sequentially from vertex data (per instance). -+/// @param startVertex - Specifies start vertex for draw. (vertex data) -+/// @param numInstances - How many instances to render. -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void DrawInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numVertices, -+ uint32_t startVertex, -+ uint32_t numInstances = 1, -+ uint32_t startInstance = 0) -+{ -+ RDTSC_START(APIDraw); -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_DRAW) -+ { -+ return; -+ } -+#endif -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); -+ uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); -+ int32_t remainingVerts = numVertices; -+ -+ API_STATE *pState = &pDC->pState->state; -+ pState->topology = topology; -+ pState->forceFront = false; -+ -+ // disable culling for points/lines -+ uint32_t oldCullMode = pState->rastState.cullMode; -+ if (topology == TOP_POINT_LIST) -+ { -+ pState->rastState.cullMode = SWR_CULLMODE_NONE; -+ pState->forceFront = true; -+ } -+ -+ int draw = 0; -+ while (remainingVerts) -+ { -+ uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? -+ remainingVerts : maxVertsPerDraw; -+ -+ bool isSplitDraw = (draw > 0) ? true : false; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); -+ InitDraw(pDC, isSplitDraw); -+ -+ pDC->FeWork.type = DRAW; -+ pDC->FeWork.pfnWork = GetFEDrawFunc( -+ false, // IsIndexed -+ pState->tsState.tsEnable, -+ pState->gsState.gsEnable, -+ pState->soState.soEnable, -+ pDC->pState->pfnProcessPrims != nullptr); -+ pDC->FeWork.desc.draw.numVerts = numVertsForDraw; -+ pDC->FeWork.desc.draw.startVertex = startVertex + draw * maxVertsPerDraw; -+ pDC->FeWork.desc.draw.numInstances = numInstances; -+ pDC->FeWork.desc.draw.startInstance = startInstance; -+ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; -+ -+ //enqueue DC -+ QueueDraw(pContext); -+ -+ remainingVerts -= numVertsForDraw; -+ draw++; -+ } -+ -+ // restore culling state -+ pDC = GetDrawContext(pContext); -+ pDC->pState->state.rastState.cullMode = oldCullMode; -+ -+ RDTSC_STOP(APIDraw, numVertices * numInstances, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDraw -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param startVertex - Specifies start vertex in vertex buffer for draw. -+/// @param primCount - Number of vertices. 
-+void SwrDraw( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t startVertex, -+ uint32_t numVertices) -+{ -+ DrawInstanced(hContext, topology, numVertices, startVertex); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. -+/// @param numInstances - How many instances to render. -+/// @param startVertex - Specifies start vertex for draw. (vertex data) -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SwrDrawInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numVertsPerInstance, -+ uint32_t numInstances, -+ uint32_t startVertex, -+ uint32_t startInstance -+ ) -+{ -+ DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawIndexedInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -+/// @param numInstances - Number of instances to render. -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void DrawIndexedInstance( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t indexOffset, -+ int32_t baseVertex, -+ uint32_t numInstances = 1, -+ uint32_t startInstance = 0) -+{ -+ RDTSC_START(APIDrawIndexed); -+ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ API_STATE* pState = &pDC->pState->state; -+ -+ int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); -+ uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); -+ int32_t remainingIndices = numIndices; -+ -+ uint32_t indexSize = 0; -+ switch (pState->indexBuffer.format) -+ { -+ case R32_UINT: indexSize = sizeof(uint32_t); break; -+ case R16_UINT: indexSize = sizeof(uint16_t); break; -+ case R8_UINT: indexSize = sizeof(uint8_t); break; -+ default: -+ SWR_ASSERT(0); -+ } -+ -+ int draw = 0; -+ uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; -+ pIB += (uint64_t)indexOffset * (uint64_t)indexSize; -+ -+ pState->topology = topology; -+ pState->forceFront = false; -+ -+ // disable culling for points/lines -+ uint32_t oldCullMode = pState->rastState.cullMode; -+ if (topology == TOP_POINT_LIST) -+ { -+ pState->rastState.cullMode = SWR_CULLMODE_NONE; -+ pState->forceFront = true; -+ } -+ -+ while (remainingIndices) -+ { -+ uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? -+ remainingIndices : maxIndicesPerDraw; -+ -+ // When breaking up draw, we need to obtain new draw context for each iteration. -+ bool isSplitDraw = (draw > 0) ? 
true : false; -+ pDC = GetDrawContext(pContext, isSplitDraw); -+ InitDraw(pDC, isSplitDraw); -+ -+ pDC->FeWork.type = DRAW; -+ pDC->FeWork.pfnWork = GetFEDrawFunc( -+ true, // IsIndexed -+ pState->tsState.tsEnable, -+ pState->gsState.gsEnable, -+ pState->soState.soEnable, -+ pDC->pState->pfnProcessPrims != nullptr); -+ pDC->FeWork.desc.draw.pDC = pDC; -+ pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; -+ pDC->FeWork.desc.draw.pIB = (int*)pIB; -+ pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; -+ -+ pDC->FeWork.desc.draw.numInstances = numInstances; -+ pDC->FeWork.desc.draw.startInstance = startInstance; -+ pDC->FeWork.desc.draw.baseVertex = baseVertex; -+ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; -+ -+ //enqueue DC -+ QueueDraw(pContext); -+ -+ pIB += maxIndicesPerDraw * indexSize; -+ remainingIndices -= numIndicesForDraw; -+ draw++; -+ } -+ -+ // restore culling state -+ pDC = GetDrawContext(pContext); -+ pDC->pState->state.rastState.cullMode = oldCullMode; -+ -+ RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); -+} -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawIndexed -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -+void SwrDrawIndexed( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t indexOffset, -+ int32_t baseVertex -+ ) -+{ -+ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawIndexedInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param numInstances - Number of instances to render. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
-+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SwrDrawIndexedInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t numInstances, -+ uint32_t indexOffset, -+ int32_t baseVertex, -+ uint32_t startInstance) -+{ -+ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); -+} -+ -+// Attach surfaces to pipeline -+void SwrInvalidateTiles( -+ HANDLE hContext, -+ uint32_t attachmentMask) -+{ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ pDC->inUse = true; -+ -+ // Queue a load to the hottile -+ pDC->FeWork.type = INVALIDATETILES; -+ pDC->FeWork.pfnWork = ProcessInvalidateTiles; -+ pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; -+ -+ //enqueue -+ QueueDraw(pContext); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDispatch -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param threadGroupCountX - Number of thread groups dispatched in X direction -+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction -+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -+void SwrDispatch( -+ HANDLE hContext, -+ uint32_t threadGroupCountX, -+ uint32_t threadGroupCountY, -+ uint32_t threadGroupCountZ) -+{ -+ RDTSC_START(APIDispatch); -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->isCompute = true; // This is a compute context. -+ pDC->inUse = true; -+ -+ COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->arena.AllocAligned(sizeof(COMPUTE_DESC), 64); -+ -+ pTaskData->threadGroupCountX = threadGroupCountX; -+ pTaskData->threadGroupCountY = threadGroupCountY; -+ pTaskData->threadGroupCountZ = threadGroupCountZ; -+ -+ uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; -+ pDC->pDispatch->initialize(totalThreadGroups, pTaskData); -+ -+ QueueDispatch(pContext); -+ RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); -+} -+ -+// Deswizzles, converts and stores current contents of the hot tiles to surface -+// described by pState -+void SwrStoreTiles( -+ HANDLE hContext, -+ SWR_RENDERTARGET_ATTACHMENT attachment, -+ SWR_TILE_STATE postStoreTileState) // TODO: Implement postStoreTileState -+{ -+ RDTSC_START(APIStoreTiles); -+ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ pDC->inUse = true; -+ -+ SetupMacroTileScissors(pDC); -+ -+ pDC->FeWork.type = STORETILES; -+ pDC->FeWork.pfnWork = ProcessStoreTiles; -+ pDC->FeWork.desc.storeTiles.attachment = attachment; -+ pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; -+ -+ //enqueue -+ QueueDraw(pContext); -+ -+ RDTSC_STOP(APIStoreTiles, 0, 0); -+ if (attachment == SWR_ATTACHMENT_COLOR0) -+ { -+ RDTSC_ENDFRAME(); -+ } -+} -+ -+void SwrClearRenderTarget( -+ HANDLE hContext, -+ uint32_t clearMask, -+ const float clearColor[4], -+ float z, -+ BYTE stencil) -+{ -+ RDTSC_START(APIClearRenderTarget); -+ -+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; -+ -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ SetupMacroTileScissors(pDC); -+ -+ pDC->inUse = true; -+ -+ CLEAR_FLAGS flags; -+ flags.mask = clearMask; -+ -+ pDC->FeWork.type = CLEAR; -+ pDC->FeWork.pfnWork = ProcessClear; -+ pDC->FeWork.desc.clear.flags = 
flags; -+ pDC->FeWork.desc.clear.clearDepth = z; -+ pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; -+ pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; -+ pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; -+ pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; -+ pDC->FeWork.desc.clear.clearStencil = stencil; -+ -+ // enqueue draw -+ QueueDraw(pContext); -+ -+ RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns a pointer to the private context state for the current -+/// draw operation. This is used for external componets such as the -+/// sampler. -+/// SWR is responsible for the allocation of the private context state. -+/// @param hContext - Handle passed back from SwrCreateContext -+VOID* SwrGetPrivateContextState( -+ HANDLE hContext) -+{ -+ SWR_CONTEXT* pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ DRAW_STATE* pState = pDC->pState; -+ -+ if (pState->pPrivateState == nullptr) -+ { -+ pState->pPrivateState = pState->arena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); -+ } -+ -+ return pState->pPrivateState; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Clients can use this to allocate memory for draw/dispatch -+/// operations. The memory will automatically be freed once operation -+/// has completed. Client can use this to allocate binding tables, -+/// etc. needed for shader execution. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param size - Size of allocation -+/// @param align - Alignment needed for allocation. -+VOID* SwrAllocDrawContextMemory( -+ HANDLE hContext, -+ uint32_t size, -+ uint32_t align) -+{ -+ SWR_CONTEXT* pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ return pDC->pState->arena.AllocAligned(size, align); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns pointer to SWR stats. -+/// @note The counters are atomically incremented by multiple threads. -+/// When calling this, you need to ensure all previous operations -+/// have completed. -+/// @todo If necessary, add a callback to avoid stalling the pipe to -+/// sample the counters. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pStats - SWR will fill this out for caller. -+void SwrGetStats( -+ HANDLE hContext, -+ SWR_STATS* pStats) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->inUse = true; -+ -+ pDC->FeWork.type = QUERYSTATS; -+ pDC->FeWork.pfnWork = ProcessQueryStats; -+ pDC->FeWork.desc.queryStats.pStats = pStats; -+ -+ // cannot execute until all previous draws have completed -+ pDC->dependency = pDC->drawId - 1; -+ -+ //enqueue -+ QueueDraw(pContext); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Enables stats counting -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param enable - If true then counts are incremented. 
-+void SwrEnableStats( -+ HANDLE hContext, -+ bool enable) -+{ -+ SWR_CONTEXT *pContext = GetContext(hContext); -+ DRAW_CONTEXT* pDC = GetDrawContext(pContext); -+ -+ pDC->pState->state.enableStats = enable; -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h -new file mode 100644 -index 0000000..1741ef6 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/api.h -@@ -0,0 +1,483 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file api.h -+* -+* @brief API definitions -+* -+******************************************************************************/ -+ -+#ifndef __SWR_API_H__ -+#define __SWR_API_H__ -+ -+#include "common/os.h" -+ -+#include -+#include -+ -+#include "common/simdintrin.h" -+#include "common/formats.h" -+#include "core/utils.h" -+#include "core/state.h" -+ -+///@todo place all the API functions into the 'swr' namespace. 
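// [Editor's note] Hedged end-to-end sketch of the API implemented in api.cpp
// above and declared in the remainder of api.h below: create a context with the
// driver's tile callbacks, set a viewport, queue a draw, then flush hot tiles
// and wait. Shader and vertex-buffer setup is elided; struct fields beyond those
// used in api.cpp are assumptions. Illustration only, not part of the original patch.
static void RenderOneTriangleList(const SWR_CREATECONTEXT_INFO& createInfo)
{
    HANDLE hContext = SwrCreateContext(&createInfo);

    SWR_VIEWPORT vp = {};
    vp.x = 0.0f;   vp.y = 0.0f;
    vp.width = 1920.0f;   vp.height = 1080.0f;
    vp.minZ = 0.0f;   vp.maxZ = 1.0f;
    SwrSetViewports(hContext, 1, &vp, nullptr);   // nullptr => SWR computes default matrix

    // ... SwrSetVertexBuffers / SwrSetFetchFunc / SwrSetVertexFunc /
    //     SwrSetPixelShaderState / SwrSetLinkage etc. would go here ...

    SwrDraw(hContext, TOP_TRIANGLE_LIST, /*startVertex*/ 0, /*numVertices*/ 3);

    // Resolve color hot tiles back to the bound surface, then block until done.
    SwrStoreTiles(hContext, SWR_ATTACHMENT_COLOR0, SWR_TILE_RESOLVED);
    SwrWaitForIdle(hContext);

    SwrDestroyContext(hContext);
}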
-+ -+typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Function signature for load hot tiles -+/// @param hPrivateContext - handle to private data -+/// @param dstFormat - format of the hot tile -+/// @param renderTargetIndex - render target to store, can be color, depth or stencil -+/// @param x - destination x coordinate -+/// @param y - destination y coordinate -+/// @param pDstHotTile - pointer to the hot tile surface -+typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Function signature for store hot tiles -+/// @param hPrivateContext - handle to private data -+/// @param srcFormat - format of the hot tile -+/// @param renderTargetIndex - render target to store, can be color, depth or stencil -+/// @param x - destination x coordinate -+/// @param y - destination y coordinate -+/// @param pSrcHotTile - pointer to the hot tile surface -+typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); -+ -+/// @brief Function signature for clearing from the hot tiles clear value -+/// @param hPrivateContext - handle to private data -+/// @param renderTargetIndex - render target to store, can be color, depth or stencil -+/// @param x - destination x coordinate -+/// @param y - destination y coordinate -+/// @param pClearColor - pointer to the hot tile's clear value -+typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, -+ SWR_RENDERTARGET_ATTACHMENT rtIndex, -+ uint32_t x, uint32_t y, const float* pClearColor); -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_CREATECONTEXT_INFO -+///////////////////////////////////////////////////////////////////////// -+struct SWR_CREATECONTEXT_INFO -+{ -+ DRIVER_TYPE driver; -+ -+ // External functions (e.g. sampler) need per draw context state. -+ // Use SwrGetPrivateContextState() to access private state. -+ uint32_t privateStateSize; -+ -+ // tile manipulation functions -+ PFN_LOAD_TILE pfnLoadTile; -+ PFN_STORE_TILE pfnStoreTile; -+ PFN_CLEAR_TILE pfnClearTile; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_RECT -+///////////////////////////////////////////////////////////////////////// -+struct SWR_RECT -+{ -+ uint32_t left; -+ uint32_t right; -+ uint32_t top; -+ uint32_t bottom; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create SWR Context. -+/// @param pCreateInfo - pointer to creation info. -+HANDLE SWR_API SwrCreateContext( -+ const SWR_CREATECONTEXT_INFO* pCreateInfo); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Destroys SWR Context. -+/// @param hContext - Handle passed back from SwrCreateContext -+void SWR_API SwrDestroyContext( -+ HANDLE hContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sync cmd. 
Executes the callback func when all rendering up to this sync -+/// has been completed -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFunc - pointer to callback function, -+/// @param userData - user data to pass back -+void SWR_API SwrSync( -+ HANDLE hContext, -+ PFN_CALLBACK_FUNC pfnFunc, -+ uint64_t userData, -+ uint64_t userData2); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Blocks until all rendering has been completed. -+/// @param hContext - Handle passed back from SwrCreateContext -+void SWR_API SwrWaitForIdle( -+ HANDLE hContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set vertex buffer state. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param numBuffers - Number of vertex buffer state descriptors. -+/// @param pVertexBuffers - Array of vertex buffer state descriptors. -+void SWR_API SwrSetVertexBuffers( -+ HANDLE hContext, -+ uint32_t numBuffers, -+ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set index buffer -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pIndexBuffer - Index buffer. -+void SWR_API SwrSetIndexBuffer( -+ HANDLE hContext, -+ const SWR_INDEX_BUFFER_STATE* pIndexBuffer); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set fetch shader pointer. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFetchFunc - Pointer to shader. -+void SWR_API SwrSetFetchFunc( -+ HANDLE hContext, -+ PFN_FETCH_FUNC pfnFetchFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set streamout shader pointer. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnSoFunc - Pointer to shader. -+/// @param streamIndex - specifies stream -+void SWR_API SwrSetSoFunc( -+ HANDLE hContext, -+ PFN_SO_FUNC pfnSoFunc, -+ uint32_t streamIndex); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set streamout state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pSoState - Pointer to streamout state. -+void SWR_API SwrSetSoState( -+ HANDLE hContext, -+ SWR_STREAMOUT_STATE* pSoState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set streamout buffer state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pSoBuffer - Pointer to streamout buffer. -+/// @param slot - Slot to bind SO buffer to. -+void SWR_API SwrSetSoBuffers( -+ HANDLE hContext, -+ SWR_STREAMOUT_BUFFER* pSoBuffer, -+ uint32_t slot); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set vertex shader pointer. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnVertexFunc - Pointer to shader. -+void SWR_API SwrSetVertexFunc( -+ HANDLE hContext, -+ PFN_VERTEX_FUNC pfnVertexFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set frontend state. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state -+void SWR_API SwrSetFrontendState( -+ HANDLE hContext, -+ SWR_FRONTEND_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set geometry shader state. 
-+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state -+void SWR_API SwrSetGsState( -+ HANDLE hContext, -+ SWR_GS_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set geometry shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to geometry shader function -+void SWR_API SwrSetGsFunc( -+ HANDLE hContext, -+ PFN_GS_FUNC pfnGsFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set compute shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to compute shader function -+/// @param totalThreadsInGroup - product of thread group dimensions. -+void SWR_API SwrSetCsFunc( -+ HANDLE hContext, -+ PFN_CS_FUNC pfnCsFunc, -+ uint32_t totalThreadsInGroup); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set tessellation state. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state -+void SWR_API SwrSetTsState( -+ HANDLE hContext, -+ SWR_TS_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set hull shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFunc - Pointer to shader function -+void SWR_API SwrSetHsFunc( -+ HANDLE hContext, -+ PFN_HS_FUNC pfnFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set domain shader -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pfnFunc - Pointer to shader function -+void SWR_API SwrSetDsFunc( -+ HANDLE hContext, -+ PFN_DS_FUNC pfnFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set depth stencil state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. -+void SWR_API SwrSetDepthStencilState( -+ HANDLE hContext, -+ SWR_DEPTH_STENCIL_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set backend state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. -+void SWR_API SwrSetBackendState( -+ HANDLE hContext, -+ SWR_BACKEND_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set pixel shader state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. -+void SWR_API SwrSetPixelShaderState( -+ HANDLE hContext, -+ SWR_PS_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set blend state -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pState - Pointer to state. 
-+void SWR_API SwrSetBlendState( -+ HANDLE hContext, -+ SWR_BLEND_STATE *pState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set blend function -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param renderTarget - render target index -+/// @param pfnBlendFunc - function pointer -+void SWR_API SwrSetBlendFunc( -+ HANDLE hContext, -+ uint32_t renderTarget, -+ PFN_BLEND_JIT_FUNC pfnBlendFunc); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Set linkage mask -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param mask - Specifies which vertex outputs are are needed by PS. -+/// @param pMap - (Optional)Linkage map to specify where FE attributes are -+/// gathered from to supply PS attribute values. The length -+/// of the map buffer needs to match the number of set bits -+/// in "mask". -+void SWR_API SwrSetLinkage( -+ HANDLE hContext, -+ uint32_t mask, -+ const uint8_t* pMap); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDraw -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param startVertex - Specifies start vertex in vertex buffer for draw. -+/// @param primCount - Number of vertices. -+void SWR_API SwrDraw( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t startVertex, -+ uint32_t primCount); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. -+/// @param numInstances - How many instances to render. -+/// @param startVertex - Specifies start vertex for draw. (vertex data) -+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SWR_API SwrDrawInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numVertsPerInstance, -+ uint32_t numInstances, -+ uint32_t startVertex, -+ uint32_t startInstance); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief DrawIndexed -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. -+void SWR_API SwrDrawIndexed( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t indexOffset, -+ int32_t baseVertex); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDrawIndexedInstanced -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param topology - Specifies topology for draw. -+/// @param numIndices - Number of indices to read sequentially from index buffer. -+/// @param numInstances - Number of instances to render. -+/// @param indexOffset - Starting index into index buffer. -+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
-+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) -+void SWR_API SwrDrawIndexedInstanced( -+ HANDLE hContext, -+ PRIMITIVE_TOPOLOGY topology, -+ uint32_t numIndices, -+ uint32_t numInstances, -+ uint32_t indexOffset, -+ int32_t baseVertex, -+ uint32_t startInstance); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrInvalidateTiles -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. -+void SWR_API SwrInvalidateTiles( -+ HANDLE hContext, -+ uint32_t attachmentMask); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrDispatch -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param threadGroupCountX - Number of thread groups dispatched in X direction -+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction -+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction -+void SWR_API SwrDispatch( -+ HANDLE hContext, -+ uint32_t threadGroupCountX, -+ uint32_t threadGroupCountY, -+ uint32_t threadGroupCountZ); -+ -+ -+enum SWR_TILE_STATE -+{ -+ SWR_TILE_INVALID = 0, // tile is in unitialized state and should be loaded with surface contents before rendering -+ SWR_TILE_DIRTY = 2, // tile contains newer data than surface it represents -+ SWR_TILE_RESOLVED = 3, // is in sync with surface it represents -+}; -+ -+/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs. -+void SWR_API SwrStoreTiles( -+ HANDLE hContext, -+ SWR_RENDERTARGET_ATTACHMENT attachment, -+ SWR_TILE_STATE postStoreTileState); -+ -+void SWR_API SwrClearRenderTarget( -+ HANDLE hContext, -+ uint32_t clearMask, -+ const FLOAT clearColor[4], -+ float z, -+ BYTE stencil); -+ -+void SWR_API SwrSetRastState( -+ HANDLE hContext, -+ const SWR_RASTSTATE *pRastState); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrSetViewports -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param numViewports - number of viewports passed in -+/// @param pViewports - Specifies extents of viewport. -+/// @param pMatrices - If not specified then SWR computes a default one. -+void SWR_API SwrSetViewports( -+ HANDLE hContext, -+ uint32_t numViewports, -+ const SWR_VIEWPORT* pViewports, -+ const SWR_VIEWPORT_MATRIX* pMatrices); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SwrSetScissorRects -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param numScissors - number of scissors passed in -+/// @param pScissors - array of scissors -+void SWR_API SwrSetScissorRects( -+ HANDLE hContext, -+ uint32_t numScissors, -+ const BBOX* pScissors); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns a pointer to the private context state for the current -+/// draw operation. This is used for external componets such as the -+/// sampler. -+/// -+/// @note Client needs to resend private state prior to each draw call. -+/// Also, SWR is responsible for the private state memory. 
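///
/// Illustrative driver-side sketch (hypothetical types; the layout of the
/// private state region is defined by the client, not by SWR):
///
///   MyDrawState* pPriv = (MyDrawState*)SwrGetPrivateContextState(hContext);
///   *pPriv = currentDrawState;   // repopulated before every draw
///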
-+/// @param hContext - Handle passed back from SwrCreateContext -+VOID* SWR_API SwrGetPrivateContextState( -+ HANDLE hContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Clients can use this to allocate memory for draw/dispatch -+/// operations. The memory will automatically be freed once operation -+/// has completed. Client can use this to allocate binding tables, -+/// etc. needed for shader execution. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param size - Size of allocation -+/// @param align - Alignment needed for allocation. -+VOID* SWR_API SwrAllocDrawContextMemory( -+ HANDLE hContext, -+ uint32_t size, -+ uint32_t align); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns pointer to SWR stats. -+/// @note The counters are incremented by multiple threads. -+/// When calling this, you need to ensure all previous operations -+/// have completed. -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param pStats - SWR will fill this out for caller. -+void SWR_API SwrGetStats( -+ HANDLE hContext, -+ SWR_STATS* pStats); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Enables stats counting -+/// @param hContext - Handle passed back from SwrCreateContext -+/// @param enable - If true then counts are incremented. -+void SWR_API SwrEnableStats( -+ HANDLE hContext, -+ bool enable); -+ -+#endif//__SWR_API_H__ -diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp -new file mode 100644 -index 0000000..bc4cfd8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/arena.cpp -@@ -0,0 +1,126 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file arena.cpp -+* -+* @brief Arena memory manager -+* The arena is convenient and fast for managing allocations for any of -+* our allocations that are associated with operations and can all be freed -+* once when their operation has completed. Allocations are cheap since -+* most of the time its simply an increment of an offset. Also, no need to -+* free individual allocations. All of the arena memory can be freed at once. 
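*
* A minimal usage sketch (illustrative only, not code taken from this patch):
*
*   Arena arena;
*   arena.Init();
*   void* pTmp = arena.AllocAligned(256, 16); // usually just bumps an offset
*   // ... pTmp stays valid until the arena is reset ...
*   arena.Reset();                            // releases every allocation at once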
-+* -+******************************************************************************/ -+ -+#include "context.h" -+#include "arena.h" -+ -+#include -+ -+VOID Arena::Init() -+{ -+ m_memUsed = 0; -+ m_pCurBlock = nullptr; -+ m_pUsedBlocks = nullptr; -+} -+ -+VOID* Arena::AllocAligned(uint32_t size, uint32_t align) -+{ -+ if (m_pCurBlock) -+ { -+ ArenaBlock* pCurBlock = m_pCurBlock; -+ pCurBlock->offset = AlignUp(pCurBlock->offset, align); -+ -+ if ((pCurBlock->offset + size) < pCurBlock->blockSize) -+ { -+ BYTE* pMem = (BYTE*)pCurBlock->pMem + pCurBlock->offset; -+ pCurBlock->offset += size; -+ return pMem; -+ } -+ -+ // Not enough memory in this arena so lets move to a new block. -+ pCurBlock->pNext = m_pUsedBlocks; -+ m_pUsedBlocks = pCurBlock; -+ m_pCurBlock = nullptr; -+ } -+ -+ static const uint32_t ArenaBlockSize = 1024*1024; -+ uint32_t defaultBlockSize = ArenaBlockSize; -+ if (m_pUsedBlocks == nullptr) -+ { -+ // First allocation after reset. Let's make the first block be the total -+ // memory allocated during last set of allocations prior to reset. -+ defaultBlockSize = std::max(m_memUsed, defaultBlockSize); -+ m_memUsed = 0; -+ } -+ -+ uint32_t blockSize = std::max(size, defaultBlockSize); -+ blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); -+ -+ VOID *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. -+ SWR_ASSERT(pMem != nullptr); -+ -+ m_pCurBlock = (ArenaBlock*)malloc(sizeof(ArenaBlock)); -+ SWR_ASSERT(m_pCurBlock != nullptr); -+ -+ if (m_pCurBlock != nullptr) -+ { -+ m_pCurBlock->pMem = pMem; -+ m_pCurBlock->blockSize = blockSize; -+ m_pCurBlock->offset = size; -+ m_memUsed += blockSize; -+ } -+ -+ return pMem; -+} -+ -+VOID* Arena::Alloc(uint32_t size) -+{ -+ return AllocAligned(size, 1); -+} -+ -+VOID Arena::Reset() -+{ -+ if (m_pCurBlock) -+ { -+ m_pCurBlock->offset = 0; -+ -+ // If we needed to allocate used blocks then reset current. -+ // The next time we allocate we'll grow the current block -+ // to match all the memory allocated this for this frame. -+ if (m_pUsedBlocks) -+ { -+ m_pCurBlock->pNext = m_pUsedBlocks; -+ m_pUsedBlocks = m_pCurBlock; -+ m_pCurBlock = nullptr; -+ } -+ } -+ -+ while(m_pUsedBlocks) -+ { -+ ArenaBlock* pBlock = m_pUsedBlocks; -+ m_pUsedBlocks = pBlock->pNext; -+ -+ _aligned_free(pBlock->pMem); -+ free(pBlock); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h -new file mode 100644 -index 0000000..e98bc83 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h -@@ -0,0 +1,63 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file arena.h -+* -+* @brief Arena memory manager -+* The arena is convenient and fast for managing allocations for any of -+* our allocations that are associated with operations and can all be freed -+* once when their operation has completed. Allocations are cheap since -+* most of the time its simply an increment of an offset. Also, no need to -+* free individual allocations. All of the arena memory can be freed at once. -+* -+******************************************************************************/ -+#pragma once -+ -+class Arena -+{ -+public: -+ Arena() : m_pCurBlock(nullptr), m_pUsedBlocks(nullptr), m_memUsed(0) { } -+ ~Arena() { } -+ -+ VOID Init(); -+ -+ VOID* AllocAligned(uint32_t size, uint32_t align); -+ VOID* Alloc(uint32_t size); -+ VOID Reset(); -+ -+private: -+ -+ struct ArenaBlock -+ { -+ ArenaBlock() : pMem(nullptr), blockSize(0), pNext(nullptr) {} -+ -+ VOID *pMem; -+ uint32_t blockSize; -+ uint32_t offset; -+ ArenaBlock *pNext; -+ }; -+ -+ ArenaBlock *m_pCurBlock; -+ ArenaBlock *m_pUsedBlocks; -+ -+ uint32_t m_memUsed; // total bytes allocated since last reset. -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp -new file mode 100644 -index 0000000..9cf2b00 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp -@@ -0,0 +1,1150 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file backend.cpp -+* -+* @brief Backend handles rasterization, pixel shading and output merger -+* operations. 
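*
* The BE ("backend") entry points below run on worker threads: workerId
* identifies the calling thread, and for per-macrotile work the macroTile
* argument encodes the tile's (x, y) position (decoded via
* MacroTileMgr::getTileIndices).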
-+* -+******************************************************************************/ -+ -+#include -+ -+#include "rdtsc_core.h" -+#include "backend.h" -+#include "depthstencil.h" -+#include "tilemgr.h" -+#include "memory/tilingtraits.h" -+#include "core/multisample.h" -+ -+#include -+ -+const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5}; -+const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5}; -+ -+/// @todo move to common lib -+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} -+static const __m128 gMaskToVec[] = { -+ MASKTOVEC(0,0,0,0), -+ MASKTOVEC(0,0,0,1), -+ MASKTOVEC(0,0,1,0), -+ MASKTOVEC(0,0,1,1), -+ MASKTOVEC(0,1,0,0), -+ MASKTOVEC(0,1,0,1), -+ MASKTOVEC(0,1,1,0), -+ MASKTOVEC(0,1,1,1), -+ MASKTOVEC(1,0,0,0), -+ MASKTOVEC(1,0,0,1), -+ MASKTOVEC(1,0,1,0), -+ MASKTOVEC(1,0,1,1), -+ MASKTOVEC(1,1,0,0), -+ MASKTOVEC(1,1,0,1), -+ MASKTOVEC(1,1,1,0), -+ MASKTOVEC(1,1,1,1), -+}; -+ -+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]); -+static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Process compute work. -+/// @param pDC - pointer to draw context (dispatch). -+/// @param workerId - The unique worker ID that is assigned to this thread. -+/// @param threadGroupId - the linear index for the thread group within the dispatch. -+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) -+{ -+ RDTSC_START(BEDispatch); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+ const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); -+ SWR_ASSERT(pTaskData != nullptr); -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ SWR_CS_CONTEXT csContext{ 0 }; -+ csContext.tileCounter = threadGroupId; -+ csContext.dispatchDims[0] = pTaskData->threadGroupCountX; -+ csContext.dispatchDims[1] = pTaskData->threadGroupCountY; -+ csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; -+ csContext.pTGSM = pContext->pScratch[workerId]; -+ -+ state.pfnCsFunc(GetPrivateState(pDC), &csContext); -+ -+ UPDATE_STAT(CsInvocations, state.totalThreadsInGroup); -+ -+ RDTSC_STOP(BEDispatch, 1, 0); -+} -+ -+void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -+{ -+ SYNC_DESC *pSync = (SYNC_DESC*)pUserData; -+ -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroTile, x, y); -+ SWR_ASSERT(x == 0 && y == 0); -+ -+ if (pSync->pfnCallbackFunc != nullptr) -+ { -+ pSync->pfnCallbackFunc(pSync->userData, pSync->userData2); -+ } -+} -+ -+void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -+{ -+ QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData; -+ SWR_STATS* pStats = pQueryDesc->pStats; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+ SWR_ASSERT(pStats != nullptr); -+ -+ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) -+ { -+ pStats->DepthPassCount += pContext->stats[i].DepthPassCount; -+ -+ pStats->IaVertices += pContext->stats[i].IaVertices; -+ pStats->IaPrimitives += pContext->stats[i].IaPrimitives; -+ pStats->VsInvocations += pContext->stats[i].VsInvocations; -+ pStats->HsInvocations += pContext->stats[i].HsInvocations; -+ pStats->DsInvocations += pContext->stats[i].DsInvocations; -+ pStats->GsInvocations += pContext->stats[i].GsInvocations; -+ pStats->PsInvocations += pContext->stats[i].PsInvocations; -+ pStats->CInvocations += pContext->stats[i].CInvocations; -+ 
pStats->CsInvocations += pContext->stats[i].CsInvocations; -+ pStats->CPrimitives += pContext->stats[i].CPrimitives; -+ pStats->GsPrimitives += pContext->stats[i].GsPrimitives; -+ -+ for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) -+ { -+ pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream]; -+ -+ /// @note client is required to provide valid write offset before every draw, so we clear -+ /// out the contents of the write offset when storing stats -+ pContext->stats[i].SoWriteOffset[stream] = 0; -+ -+ pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream]; -+ pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream]; -+ } -+ } -+} -+ -+template -+void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) -+{ -+ auto lambda = [&](int comp) -+ { -+ FormatTraits::storeSOA(comp, pTileBuffer, value.v[comp]); -+ pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits::GetBPC(comp) / 8); -+ }; -+ -+ const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); -+ for (uint32_t i = 0; i < numIter; ++i) -+ { -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+ } -+} -+ -+template -+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4]) -+{ -+ // convert clear color to hottile format -+ // clear color is in RGBA float/uint32 -+ simdvector vClear; -+ for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ simdscalar vComp; -+ vComp = _simd_load1_ps((const float*)&clear[comp]); -+ if (FormatTraits::isNormalized(comp)) -+ { -+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); -+ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); -+ } -+ vComp = FormatTraits::pack(comp, vComp); -+ vClear.v[FormatTraits::swizzle(comp)] = vComp; -+ } -+ -+ uint32_t tileX, tileY; -+ MacroTileMgr::getTileIndices(macroTile, tileX, tileY); -+ const API_STATE& state = GetApiState(pDC); -+ -+ int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY; -+ int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1; -+ int left = KNOB_MACROTILE_X_DIM_FIXED * tileX; -+ int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1; -+ -+ // intersect with scissor -+ top = std::max(top, state.scissorInFixedPoint.top); -+ left = std::max(left, state.scissorInFixedPoint.left); -+ bottom = std::min(bottom, state.scissorInFixedPoint.bottom); -+ right = std::min(right, state.scissorInFixedPoint.right); -+ -+ // translate to local hottile origin -+ top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; -+ bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; -+ left -= KNOB_MACROTILE_X_DIM_FIXED * tileX; -+ right -= KNOB_MACROTILE_X_DIM_FIXED * tileX; -+ -+ // convert to raster tiles -+ top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ -+ const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); -+ // compute steps between raster tile samples / raster tiles / macro tile rows -+ const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8; -+ const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * numSamples; -+ const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; -+ const uint32_t pitch = (FormatTraits::bpp * KNOB_MACROTILE_X_DIM / 8); -+ -+ HOTTILE *pHotTile = 
pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples); -+ uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits::bpp > >(pitch, left, top)) * numSamples; -+ uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits::bpp > >(pitch, x, y)) * numSamples; -+ -+ // loop over all raster tiles in the current hot tile -+ for (int y = top; y <= bottom; ++y) -+ { -+ uint8_t* pRasterTile = pRasterTileRow; -+ for (int x = left; x <= right; ++x) -+ { -+ for( int sampleNum = 0; sampleNum < numSamples; sampleNum++) -+ { -+ ClearRasterTile(pRasterTile, vClear); -+ pRasterTile += rasterTileSampleStep; -+ } -+ } -+ pRasterTileRow += macroTileRowStep; -+ } -+ -+ pHotTile->state = HOTTILE_DIRTY; -+} -+ -+ -+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) -+{ -+ if (KNOB_FAST_CLEAR) -+ { -+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; -+ uint32_t numSamples = GetNumSamples(sampleCount); -+ -+ SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason. -+ -+ RDTSC_START(BEClear); -+ -+ if (pClear->flags.mask & SWR_CLEAR_COLOR) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples); -+ // All we want to do here is to mark the hot tile as being in a "needs clear" state. -+ pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); -+ pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); -+ pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); -+ pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); -+ pHotTile->state = HOTTILE_CLEAR; -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_DEPTH) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples); -+ pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; -+ pHotTile->state = HOTTILE_CLEAR; -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_STENCIL) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples); -+ -+ pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil; -+ pHotTile->state = HOTTILE_CLEAR; -+ } -+ -+ RDTSC_STOP(BEClear, 0, 0); -+ } -+ else -+ { -+ // Legacy clear -+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; -+ RDTSC_START(BEClear); -+ -+ if (pClear->flags.mask & SWR_CLEAR_COLOR) -+ { -+ /// @todo clear data should come in as RGBA32_FLOAT -+ DWORD clearData[4]; -+ float clearFloat[4]; -+ clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; -+ clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; -+ clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; -+ clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; -+ clearData[0] = *(DWORD*)&clearFloat[0]; -+ clearData[1] = *(DWORD*)&clearFloat[1]; -+ clearData[2] = *(DWORD*)&clearFloat[2]; -+ clearData[3] = *(DWORD*)&clearFloat[3]; -+ -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; -+ SWR_ASSERT(pfnClearTiles != nullptr); -+ -+ pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData); -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_DEPTH) -+ { -+ DWORD clearData[4]; -+ clearData[0] = *(DWORD*)&pClear->clearDepth; -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; -+ SWR_ASSERT(pfnClearTiles != 
nullptr); -+ -+ pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData); -+ } -+ -+ if (pClear->flags.mask & SWR_CLEAR_STENCIL) -+ { -+ uint32_t value = pClear->clearStencil; -+ DWORD clearData[4]; -+ clearData[0] = *(DWORD*)&value; -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; -+ -+ pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData); -+ } -+ -+ RDTSC_STOP(BEClear, 0, 0); -+ } -+} -+ -+ -+void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ RDTSC_START(BEStoreTiles); -+ STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+#ifdef KNOB_ENABLE_RDTSC -+ uint32_t numTiles = 0; -+#endif -+ SWR_FORMAT srcFormat; -+ switch (pDesc->attachment) -+ { -+ case SWR_ATTACHMENT_COLOR0: -+ case SWR_ATTACHMENT_COLOR1: -+ case SWR_ATTACHMENT_COLOR2: -+ case SWR_ATTACHMENT_COLOR3: -+ case SWR_ATTACHMENT_COLOR4: -+ case SWR_ATTACHMENT_COLOR5: -+ case SWR_ATTACHMENT_COLOR6: -+ case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; -+ default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ } -+ -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroTile, x, y); -+ -+ // Only need to store the hottile if it's been rendered to... -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false); -+ if (pHotTile) -+ { -+ // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. -+ if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; -+ SWR_ASSERT(pfnClearTiles != nullptr); -+ -+ pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData); -+ } -+ -+ if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) -+ { -+ int destX = KNOB_MACROTILE_X_DIM * x; -+ int destY = KNOB_MACROTILE_Y_DIM * y; -+ -+ pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, -+ pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ } -+ -+ -+ if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) -+ { -+ pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; -+ } -+ } -+ RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId); -+} -+ -+ -+void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; -+ SWR_CONTEXT *pContext = pDC->pContext; -+ -+ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) -+ { -+ if (pDesc->attachmentMask & (1 << i)) -+ { -+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); -+ if (pHotTile) -+ { -+ pHotTile->state = HOTTILE_INVALID; -+ } -+ } -+ } -+} -+ -+#if KNOB_SIMD_WIDTH == 8 -+const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; -+const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; -+const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -+const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -+#define MASK 0xff -+#else -+#error Unsupported vector width -+#endif -+ -+INLINE -+bool CanEarlyZ(const SWR_PS_STATE *pPSState) -+{ -+ 
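    // Early Z is only safe when the pixel shader neither writes oDepth nor
    // reads the interpolated source depth; otherwise the depth/stencil test
    // has to run after the shader (see the late-Z path in the backends below).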
return (!pPSState->writesODepth && !pPSState->usesSourceDepth); -+} -+ -+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) -+{ -+ simdscalar vClipMask = _simd_setzero_ps(); -+ uint32_t numClipDistance = _mm_popcnt_u32(clipMask); -+ -+ for (uint32_t i = 0; i < numClipDistance; ++i) -+ { -+ // pull triangle clip distance values from clip buffer -+ simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); -+ simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); -+ simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); -+ -+ // interpolate -+ simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); -+ -+ // clip if interpolated clip distance is < 0 || NAN -+ simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); -+ -+ vClipMask = _simd_or_ps(vClipMask, vCull); -+ } -+ -+ return _simd_movemask_ps(vClipMask); -+} -+ -+template -+void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -+{ -+ RDTSC_START(BESetup); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_PS_STATE *pPSState = &state.psState; -+ const SWR_BLEND_STATE *pBlendState = &state.blendState; -+ -+ // broadcast scalars -+ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); -+ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); -+ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); -+ -+ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); -+ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); -+ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); -+ -+ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); -+ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); -+ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); -+ -+ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); -+ -+ simdscalar vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); -+ simdscalar vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); -+ simdscalar vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -+ -+ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] = renderBuffers.pColor[rt]; -+ } -+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; -+ RDTSC_STOP(BESetup, 0, 0); -+ -+ SWR_PS_CONTEXT psContext; -+ psContext.pAttribs = work.pAttribs; -+ psContext.pPerspAttribs = work.pPerspAttribs; -+ psContext.frontFace = work.triFlags.frontFacing; -+ psContext.primID = work.triFlags.primID; -+ -+ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs -+ psContext.I = work.I; -+ psContext.J = work.J; -+ psContext.recipDet = work.recipDet; -+ psContext.pSamplePos = work.pSamplePos; -+ const uint32_t numSamples = MultisampleTraits::numSamples; -+ -+ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) -+ { -+ simdscalar vYSamplePosUL; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // pixel center -+ psContext.vY = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); -+ } -+ else -+ { -+ // UL pixel corner -+ vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); -+ } -+ -+ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) -+ { -+ simdscalar vXSamplePosUL; -+ if(sampleCount > SWR_MULTISAMPLE_1X) -+ { -+ // UL pixel corner -+ vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); -+ } -+ -+ // @todo: uint32_t sampleMask = 
state.rastState.sampleMask & MultisampleTraits::sampleMask; -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ /// @todo: sampleMask / inputcoverage -+ if (work.coverageMask[sample] & MASK) -+ { -+ RDTSC_START(BEBarycentric); -+ -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // pixel center -+ psContext.vX = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); -+ } -+ else -+ { -+ // calculate per sample positions -+ psContext.vX = _simd_add_ps(vXSamplePosUL, MultisampleTraits::vX(sample)); -+ psContext.vY = _simd_add_ps(vYSamplePosUL, MultisampleTraits::vY(sample)); -+ } -+ -+ // evaluate I,J -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ simdmask coverageMask = work.coverageMask[sample] & MASK; -+ -+ // interpolate user clip distance if available -+ if (rastState.clipDistanceMask) -+ { -+ coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, -+ psContext.vI, psContext.vJ); -+ } -+ -+ simdscalar depthPassMask = vMask(coverageMask); -+ -+ uint8_t *pDepthSample, *pStencilSample; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ pDepthSample = pDepthBase; -+ pStencilSample = pStencilBase; -+ } -+ else -+ { -+ // offset depth/stencil buffers current sample -+ pDepthSample = pDepthBase + MultisampleTraits::RasterTileDepthOffset(sample); -+ pStencilSample = pStencilBase + MultisampleTraits::RasterTileStencilOffset(sample); -+ } -+ -+ // Early-Z? -+ if (CanEarlyZ(pPSState)) -+ { -+ RDTSC_START(BEEarlyDepthTest); -+ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthBase, depthPassMask, pStencilBase, pPSState->killsPixel); -+ RDTSC_STOP(BEEarlyDepthTest, 0, 0); -+ -+ if (!_simd_movemask_ps(depthPassMask)) -+ { -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ continue; -+ } -+ } -+ -+ // interpolate 1/w -+ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); -+ psContext.sampleIndex = sample; -+ psContext.mask = _simd_castps_si(depthPassMask); -+ -+ // execute pixel shader -+ RDTSC_START(BEPixelShader); -+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); -+ RDTSC_STOP(BEPixelShader, 0, 0); -+ -+ depthPassMask = _simd_castsi_ps(psContext.mask); -+ -+ //// late-Z -+ if (!CanEarlyZ(pPSState) || pPSState->killsPixel) -+ { -+ RDTSC_START(BELateDepthTest); -+ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthSample, depthPassMask, pStencilSample, false); -+ RDTSC_STOP(BELateDepthTest, 0, 0); -+ -+ if (!_simd_movemask_ps(depthPassMask)) -+ { -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ continue; -+ } -+ } -+ -+ uint32_t statMask = _simd_movemask_ps(depthPassMask); -+ uint32_t statCount = _mm_popcnt_u32(statMask); -+ UPDATE_STAT(DepthPassCount, statCount); -+ -+ simdscalari mask = _simd_castps_si(depthPassMask); -+ -+ // output merger -+ RDTSC_START(BEOutputMerger); -+ -+ if(sampleCount != SWR_MULTISAMPLE_1X) -+ { -+ if(rastState.isSampleMasked[sample]) -+ { -+ continue; -+ } -+ } -+ -+ uint32_t rasterTileColorOffset = MultisampleTraits::RasterTileColorOffset(sample); -+ for 
(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ uint8_t *pColorSample; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ pColorSample = pColorBase[rt]; -+ } -+ else -+ { -+ pColorSample = pColorBase[rt] + rasterTileColorOffset; -+ } -+ -+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; -+ -+ // Blend outputs -+ if (pRTBlend->colorBlendEnable) -+ { -+ state.pfnBlendFunc[rt](pBlendState, psContext.shaded[rt], psContext.shaded[1], pColorSample, psContext.shaded[rt]); -+ } -+ -+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. -+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); -+ -+ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); -+ -+ // store with color mask -+ if (!pRTBlend->writeDisableRed) -+ { -+ _simd_maskstore_ps((float*)pColorSample, mask, psContext.shaded[rt].x); -+ } -+ if (!pRTBlend->writeDisableGreen) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd), mask, psContext.shaded[rt].y); -+ } -+ if (!pRTBlend->writeDisableBlue) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), mask, psContext.shaded[rt].z); -+ } -+ if (!pRTBlend->writeDisableAlpha) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), mask, psContext.shaded[rt].w); -+ } -+ } -+ -+ RDTSC_STOP(BEOutputMerger, 0, 0); -+ } -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ } -+ RDTSC_START(BEEndTile); -+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ -+ for (uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ } -+ RDTSC_STOP(BEEndTile, 0, 0); -+ } -+ } -+} -+ -+template -+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -+{ -+ RDTSC_START(BESetup); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_PS_STATE *pPSState = &state.psState; -+ const SWR_BLEND_STATE *pBlendState = &state.blendState; -+ -+ // broadcast scalars -+ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); -+ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); -+ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); -+ -+ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); -+ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); -+ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); -+ -+ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); -+ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); -+ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); -+ -+ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); -+ -+ simdscalar vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); -+ simdscalar vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); -+ simdscalar vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); -+ -+ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] = renderBuffers.pColor[rt]; -+ } -+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; -+ RDTSC_STOP(BESetup, 0, 0); -+ -+ SWR_PS_CONTEXT psContext; -+ psContext.pAttribs = work.pAttribs; -+ psContext.pPerspAttribs = work.pPerspAttribs; -+ psContext.frontFace = work.triFlags.frontFacing; -+ psContext.primID = work.triFlags.primID; -+ -+ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs -+ 
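    // Note: this pixel-rate path evaluates the pixel shader once per pixel and
    // broadcasts the result to every covered sample; the sample-rate path above
    // re-runs the shader for each individual sample instead.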
psContext.I = work.I; -+ psContext.J = work.J; -+ psContext.recipDet = work.recipDet; -+ psContext.pSamplePos = work.pSamplePos; -+ psContext.sampleIndex = 0; -+ -+ const uint32_t numSamples = MultisampleTraits::numSamples; -+ for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) -+ { -+ simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); -+ simdscalar vYSamplePosCenter = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); -+ for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) -+ { -+ simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); -+ simdscalar vXSamplePosCenter = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); -+ -+ // if oDepth written to, or there is a potential to discard any samples, we need to -+ // run the PS early, then interp or broadcast Z and test -+ if(pPSState->writesODepth || pPSState->killsPixel) -+ { -+ RDTSC_START(BEBarycentric); -+ // set pixel center positions -+ psContext.vX = vXSamplePosCenter; -+ psContext.vY = vYSamplePosCenter; -+ -+ // evaluate I, J at pixel center -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ // interpolate 1/w -+ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); -+ -+ /// @todo: sampleMask / inputcoverage -+ // for now just pass in all 1s -+ psContext.mask = _simd_set1_epi32(-1); -+ -+ // execute pixel shader -+ RDTSC_START(BEPixelShader); -+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); -+ RDTSC_STOP(BEPixelShader, 0, 0); -+ } -+ else -+ { -+ /// @todo: sampleMask / inputcoverage -+ // for now just through full pixel output -+ psContext.mask = _simd_set1_epi32(-1); -+ } -+ -+ simdscalar depthPassMask[numSamples]; -+ simdscalar anyDepthSamplePassed = _simd_setzero_ps(); -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ /// @todo: sampleMask / inputcoverage -+ depthPassMask[sample] = vMask(work.coverageMask[sample] & MASK); -+ // pull mask back out for any discards and and with coverage -+ depthPassMask[sample] = _simd_and_ps(depthPassMask[sample], _simd_castsi_ps(psContext.mask)); -+ -+ if (!_simd_movemask_ps(depthPassMask[sample])) -+ { -+ depthPassMask[sample] = _simd_setzero_ps(); -+ continue; -+ } -+ -+ // if oDepth isn't written to, we need to interpolate Z for each sample -+ // if clip distances are enabled, we need to interpolate for each sample -+ if(!pPSState->writesODepth || rastState.clipDistanceMask) -+ { -+ RDTSC_START(BEBarycentric); -+ // calculate per sample positions -+ simdscalar vSamplePosX = _simd_add_ps(vXSamplePosUL, MultisampleTraits::vX(sample)); -+ simdscalar vSamplePosY = _simd_add_ps(vYSamplePosUL, MultisampleTraits::vY(sample)); -+ -+ // evaluate I,J at sample positions -+ psContext.vI = vplaneps(vIa, vIb, vIc, vSamplePosX, vSamplePosY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, vSamplePosX, vSamplePosY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ if (!pPSState->writesODepth) -+ { -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ } -+ -+ // interpolate 
clip distances -+ if (rastState.clipDistanceMask) -+ { -+ uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, -+ psContext.vI, psContext.vJ); -+ depthPassMask[sample] = _simd_and_ps(depthPassMask[sample], vMask(~clipMask)); -+ } -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ } -+ // else 'broadcast' and test psContext.vZ from the PS invocation for each sample -+ -+ // offset depth/stencil buffers current sample -+ uint8_t *pDepthSample = pDepthBase + MultisampleTraits::RasterTileDepthOffset(sample); -+ uint8_t * pStencilSample = pStencilBase + MultisampleTraits::RasterTileStencilOffset(sample); -+ -+ // ZTest for this sample -+ RDTSC_START(BEEarlyDepthTest); -+ depthPassMask[sample] = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthSample, depthPassMask[sample], pStencilSample, false); -+ RDTSC_STOP(BEEarlyDepthTest, 0, 0); -+ -+ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); -+ -+ uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); -+ uint32_t statCount = _mm_popcnt_u32(statMask); -+ UPDATE_STAT(DepthPassCount, statCount); -+ } -+ -+ // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS -+ if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed)) -+ { -+ RDTSC_START(BEBarycentric); -+ // set pixel center positions -+ psContext.vX = vXSamplePosCenter; -+ psContext.vY = vYSamplePosCenter; -+ -+ // evaluate I,J at pixel center -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ // interpolate 1/w -+ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); -+ -+ // execute pixel shader -+ RDTSC_START(BEPixelShader); -+ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); -+ RDTSC_STOP(BEPixelShader, 0, 0); -+ } -+ else -+ { -+ goto Endtile; -+ } -+ -+ // loop over all samples, broadcasting the results of the PS to all passing pixels -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ if(sampleCount != SWR_MULTISAMPLE_1X) -+ { -+ if(rastState.isSampleMasked[sample]) -+ continue; -+ } -+ -+ // output merger -+ RDTSC_START(BEOutputMerger); -+ // skip if none of the pixels for this sample passed -+ if(!_simd_movemask_ps(depthPassMask[sample])) -+ { -+ depthPassMask[sample] = _simd_setzero_ps(); -+ continue; -+ } -+ simdscalari mask = _simd_castps_si(depthPassMask[sample]); -+ uint32_t rasterTileColorOffset = MultisampleTraits::RasterTileColorOffset(sample); -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; -+ -+ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; -+ -+ // Blend outputs -+ if(pRTBlend->colorBlendEnable) -+ { -+ state.pfnBlendFunc[rt](pBlendState, psContext.shaded[rt], psContext.shaded[1], pColorSample, psContext.shaded[rt]); -+ } -+ -+ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. 
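    // Hot tile color data is stored SoA: each channel occupies a
    // KNOB_SIMD_WIDTH * sizeof(float) run, so the masked stores below step by
    // that stride and only write lanes that survived coverage, depth and discard.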
-+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); -+ -+ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); -+ -+ // store with color mask -+ if(!pRTBlend->writeDisableRed) -+ { -+ _simd_maskstore_ps((float*)pColorSample, mask, psContext.shaded[rt].x); -+ } -+ if(!pRTBlend->writeDisableGreen) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd), mask, psContext.shaded[rt].y); -+ } -+ if(!pRTBlend->writeDisableBlue) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), mask, psContext.shaded[rt].z); -+ } -+ if(!pRTBlend->writeDisableAlpha) -+ { -+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), mask, psContext.shaded[rt].w); -+ } -+ } -+ RDTSC_STOP(BEOutputMerger, 0, 0); -+ } -+ -+Endtile: -+ RDTSC_START(BEEndTile); -+ for(uint32_t sample = 0; sample < numSamples; sample++) -+ { -+ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ } -+ -+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ } -+ RDTSC_STOP(BEEndTile, 0, 0); -+ } -+ } -+} -+// optimized backend flow with NULL PS -+void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) -+{ -+ RDTSC_START(BESetup); -+ -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const API_STATE& state = GetApiState(pDC); -+ // todo multisample -+ uint64_t coverageMask = work.coverageMask[0]; -+ -+ // broadcast scalars -+ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); -+ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); -+ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); -+ -+ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); -+ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); -+ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); -+ -+ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); -+ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); -+ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); -+ -+ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); -+ -+ BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; -+ -+ RDTSC_STOP(BESetup, 0, 0); -+ -+ SWR_PS_CONTEXT psContext; -+ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) -+ { -+ psContext.vY = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); -+ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) -+ { -+ if (coverageMask & MASK) -+ { -+ RDTSC_START(BEBarycentric); -+ -+ // calculate pixel positions -+ psContext.vX = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); -+ -+ // evaluate I,J -+ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); -+ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); -+ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); -+ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); -+ -+ // interpolate z -+ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); -+ -+ RDTSC_STOP(BEBarycentric, 0, 0); -+ -+ simdscalar depthPassMask = vMask(coverageMask & MASK); -+ RDTSC_START(BEEarlyDepthTest); -+ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, -+ psContext.vZ, pDepthBase, depthPassMask, pStencilBase, false); -+ RDTSC_STOP(BEEarlyDepthTest, 0, 0); -+ -+ uint32_t statMask = _simd_movemask_ps(depthPassMask); -+ uint32_t statCount = _mm_popcnt_u32(statMask); -+ 
UPDATE_STAT(DepthPassCount, statCount); -+ } -+ coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); -+ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; -+ } -+ } -+} -+ -+void InitClearTilesTable() -+{ -+ memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); -+ -+ sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile; -+ sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile; -+ sClearTilesTable[R32_FLOAT] = ClearMacroTile; -+ sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile; -+ sClearTilesTable[R8_UINT] = ClearMacroTile; -+} -+ -+// initialize backend function tables -+PFN_BACKEND_FUNC gSingleSampleBackendTable[] = { -+ BackendSampleRate<0, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_1X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_1X>, -+}; -+ -+// MSAA per sample shading rate -+PFN_BACKEND_FUNC gSampleRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS] ={ -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_2X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_2X>, -+ }, -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_4X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_4X>, -+ }, -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_8X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_8X>, -+ }, -+ { -+ BackendSampleRate<0, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<1, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<2, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<3, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<4, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<5, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<6, SWR_MULTISAMPLE_16X>, -+ BackendSampleRate<7, SWR_MULTISAMPLE_16X>, -+ } -+}; -+ -+// MSAA per pixel shading rate -+PFN_BACKEND_FUNC gPixelRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS] ={ -+ { -+ BackendPixelRate<0, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_2X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_2X>, -+ }, -+ { -+ BackendPixelRate<0, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_4X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_4X>, -+ }, -+ { 
-+ BackendPixelRate<0, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_8X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_8X>, -+ }, -+ { -+ BackendPixelRate<0, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<1, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<2, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<3, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<4, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<5, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<6, SWR_MULTISAMPLE_16X>, -+ BackendPixelRate<7, SWR_MULTISAMPLE_16X>, -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h -new file mode 100644 -index 0000000..218f5c0 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h -@@ -0,0 +1,45 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file backend.h -+* -+* @brief Backend handles rasterization, pixel shading and output merger -+* operations. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/os.h" -+#include "core/context.h" -+ -+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); -+void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -+void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); -+void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -+void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -+void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); -+void InitClearTilesTable(); -+ -+extern PFN_BACKEND_FUNC gSingleSampleBackendTable[]; -+extern PFN_BACKEND_FUNC gSampleRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS]; -+extern PFN_BACKEND_FUNC gPixelRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS]; -diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h -new file mode 100644 -index 0000000..626c237 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/blend.h -@@ -0,0 +1,318 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file blend.cpp -+* -+* @brief Implementation for blending operations. 
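// The blending below implements, per channel and across SIMD lanes,
//     out = (src * srcFactor) OP (dst * dstFactor)
// As a concrete scalar example (illustration only, not part of the patch),
// classic back-to-front alpha blending uses srcFactor = SRC_ALPHA,
// dstFactor = INV_SRC_ALPHA and OP = ADD:
//
//     float BlendChannelAlphaExample(float srcC, float dstC, float srcA)
//     {
//         float srcFactor = srcA;          // BLENDFACTOR_SRC_ALPHA
//         float dstFactor = 1.0f - srcA;   // BLENDFACTOR_INV_SRC_ALPHA
//         return srcC * srcFactor + dstC * dstFactor;   // BLENDOP_ADD
//     }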
-+* -+******************************************************************************/ -+#include "state.h" -+ -+template -+INLINE -+void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out) -+{ -+ simdvector result; -+ -+ switch (func) -+ { -+ case BLENDFACTOR_ZERO: -+ result.x = _simd_setzero_ps(); -+ result.y = _simd_setzero_ps(); -+ result.z = _simd_setzero_ps(); -+ result.w = _simd_setzero_ps(); -+ break; -+ -+ case BLENDFACTOR_ONE: -+ result.x = _simd_set1_ps(1.0); -+ result.y = _simd_set1_ps(1.0); -+ result.z = _simd_set1_ps(1.0); -+ result.w = _simd_set1_ps(1.0); -+ break; -+ -+ case BLENDFACTOR_SRC_COLOR: -+ result = src; -+ break; -+ -+ case BLENDFACTOR_DST_COLOR: -+ result = dst; -+ break; -+ -+ case BLENDFACTOR_INV_SRC_COLOR: -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z); -+ result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w); -+ break; -+ -+ case BLENDFACTOR_INV_DST_COLOR: -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z); -+ result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); -+ break; -+ -+ case BLENDFACTOR_SRC_ALPHA: result.x = src.w; -+ result.y = src.w; -+ result.z = src.w; -+ result.w = src.w; -+ break; -+ -+ case BLENDFACTOR_INV_SRC_ALPHA: -+ { -+ simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w); -+ result.x = oneMinusSrcA; -+ result.y = oneMinusSrcA; -+ result.z = oneMinusSrcA; -+ result.w = oneMinusSrcA; -+ break; -+ } -+ -+ case BLENDFACTOR_DST_ALPHA: result.x = dst.w; -+ result.y = dst.w; -+ result.z = dst.w; -+ result.w = dst.w; -+ break; -+ -+ case BLENDFACTOR_INV_DST_ALPHA: -+ { -+ simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); -+ result.x = oneMinusDstA; -+ result.y = oneMinusDstA; -+ result.z = oneMinusDstA; -+ result.w = oneMinusDstA; -+ break; -+ } -+ -+ case BLENDFACTOR_SRC_ALPHA_SATURATE: -+ { -+ simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w)); -+ result.x = sat; -+ result.y = sat; -+ result.z = sat; -+ result.w = _simd_set1_ps(1.0); -+ break; -+ } -+ -+ case BLENDFACTOR_CONST_COLOR: -+ result.x = constantColor[0]; -+ result.y = constantColor[1]; -+ result.z = constantColor[2]; -+ result.w = constantColor[3]; -+ break; -+ -+ case BLENDFACTOR_CONST_ALPHA: -+ result.x = result.y = result.z = result.w = constantColor[3]; -+ break; -+ -+ case BLENDFACTOR_INV_CONST_COLOR: -+ { -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]); -+ result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); -+ break; -+ } -+ -+ case BLENDFACTOR_INV_CONST_ALPHA: -+ { -+ result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); -+ break; -+ } -+ -+ case BLENDFACTOR_SRC1_COLOR: -+ result.x = src1.x; -+ result.y = src1.y; -+ result.z = src1.z; -+ result.w = src1.w; -+ break; -+ -+ case BLENDFACTOR_SRC1_ALPHA: -+ result.x = result.y = result.z = result.w = src1.w; -+ break; -+ -+ case BLENDFACTOR_INV_SRC1_COLOR: -+ result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x); -+ result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y); -+ result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z); -+ result.w = 
_simd_sub_ps(_simd_set1_ps(1.0f), src1.w); -+ break; -+ -+ case BLENDFACTOR_INV_SRC1_ALPHA: -+ result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); -+ break; -+ -+ default: SWR_ASSERT(false, "Unimplemented blend factor: %d", func); -+ } -+ -+ if (Color) -+ { -+ out.x = result.x; -+ out.y = result.y; -+ out.z = result.z; -+ } -+ if (Alpha) -+ { -+ out.w = result.w; -+ } -+ -+} -+ -+template -+INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out) -+{ -+ simdvector result; -+ -+ switch (blendOp) -+ { -+ case BLENDOP_ADD: -+ result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ case BLENDOP_SUBTRACT: -+ result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ case BLENDOP_REVSUBTRACT: -+ result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x)); -+ result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y)); -+ result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z)); -+ result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w)); -+ break; -+ -+ case BLENDOP_MIN: -+ result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ case BLENDOP_MAX: -+ result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); -+ result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); -+ result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); -+ result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); -+ break; -+ -+ default: -+ SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp); -+ } -+ -+ if (Color) -+ { -+ out.x = result.x; -+ out.y = result.y; -+ out.z = result.z; -+ } -+ if (Alpha) -+ { -+ out.w = result.w; -+ } -+} -+ -+template -+INLINE void Clamp(simdvector &src) -+{ -+ switch (type) -+ { -+ case SWR_TYPE_FLOAT: -+ break; -+ -+ case SWR_TYPE_UNORM: -+ src.x = _simd_max_ps(src.x, _simd_setzero_ps()); -+ src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); -+ -+ src.y = _simd_max_ps(src.y, _simd_setzero_ps()); -+ src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); -+ -+ src.z = _simd_max_ps(src.z, _simd_setzero_ps()); -+ src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); -+ -+ src.w = _simd_max_ps(src.w, _simd_setzero_ps()); -+ src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); -+ break; -+ -+ case SWR_TYPE_SNORM: -+ src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f)); -+ src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); -+ -+ src.y = _simd_max_ps(src.y, 
_simd_set1_ps(-1.0f)); -+ src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); -+ -+ src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f)); -+ src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); -+ -+ src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f)); -+ src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); -+ break; -+ -+ default: -+ SWR_ASSERT(false, "Unimplemented clamp: %d", type); -+ break; -+ } -+} -+ -+template -+void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result) -+{ -+ // load render target -+ simdvector dst; -+ LoadSOA(pDst, dst); -+ -+ simdvector constColor; -+ constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]); -+ constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]); -+ constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]); -+ constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]); -+ -+ // clamp src/dst/constant -+ Clamp(src); -+ Clamp(src1); -+ Clamp(dst); -+ Clamp(constColor); -+ -+ simdvector srcFactor, dstFactor; -+ if (pBlendState->independentAlphaBlendEnable) -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor); -+ -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ BlendFunc((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+ else -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp -new file mode 100644 -index 0000000..ce27bf7 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp -@@ -0,0 +1,201 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file clip.cpp -+* -+* @brief Implementation for clipping -+* -+******************************************************************************/ -+ -+#include -+ -+#include "common/os.h" -+#include "core/clip.h" -+ -+float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) -+{ -+ return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); -+} -+ -+template -+inline void intersect( -+ int s, // index to first edge vertex v0 in pInPts. -+ int p, // index to second edge vertex v1 in pInPts. -+ const float *pInPts, // array of all the input positions. -+ const float *pInAttribs, // array of all attributes for all vertex. All the attributes for each vertex is contiguous. -+ int numInAttribs, // number of attributes per vertex. -+ int i, // output index. -+ float *pOutPts, // array of output positions. We'll write our new intersection point at i*4. -+ float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs. -+{ -+ float t; -+ -+ // Find the parameter of the intersection. -+ // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. -+ const float *v1 = &pInPts[s*4]; -+ const float *v2 = &pInPts[p*4]; -+ -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break; -+ case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break; -+ case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break; -+ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break; -+ case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break; -+ case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break; -+ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ }; -+ -+ -+ const float *a1 = &pInAttribs[s*numInAttribs]; -+ const float *a2 = &pInAttribs[p*numInAttribs]; -+ -+ float *pOutP = &pOutPts[i*4]; -+ float *pOutA = &pOutAttribs[i*numInAttribs]; -+ -+ // Interpolate new position. -+ for(int j = 0; j < 4; ++j) -+ { -+ pOutP[j] = v1[j] + (v2[j]-v1[j])*t; -+ } -+ -+ // Interpolate Attributes -+ for(int attr = 0; attr < numInAttribs; ++attr) -+ { -+ pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t; -+ } -+} -+ -+ -+// Checks whether vertex v lies inside clipping plane -+// in homogenous coords check -w < {x,y,z} < w; -+// -+template -+inline int inside(const float v[4]) -+{ -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT : return (v[0]>=-v[3]); -+ case FRUSTUM_RIGHT : return (v[0]<= v[3]); -+ case FRUSTUM_TOP : return (v[1]>=-v[3]); -+ case FRUSTUM_BOTTOM : return (v[1]<= v[3]); -+ case FRUSTUM_NEAR : return (v[2]>=0.0f); -+ case FRUSTUM_FAR : return (v[2]<= v[3]); -+ default: -+ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ return 0; -+ } -+} -+ -+ -+// Clips a polygon in homogenous coordinates to a particular clipping plane. 
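// Worked example of the interpolation factor computed by intersect() above,
// for the near plane as treated here (a vertex is inside when z >= 0, so the
// boundary coordinate is simply z):
//
//     v1.z = -1.0 (outside),  v2.z = +3.0 (inside)
//     t = ComputeInterpFactor(-1.0, 3.0) = -1 / (-1 - 3) = 0.25
//
// The new vertex therefore sits a quarter of the way from v1 to v2, where
//     z = -1 + 0.25 * (3 - (-1)) = 0   (exactly on the plane)
// and every attribute is interpolated with the same t.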
-+// Takes in vertices of the polygon (InPts) and the clipping plane -+// Puts the vertices of the clipped polygon in OutPts -+// Returns number of points in clipped polygon -+// -+template -+int ClipTriToPlane( const float *pInPts, int numInPts, -+ const float *pInAttribs, int numInAttribs, -+ float *pOutPts, float *pOutAttribs) -+{ -+ int i=0; // index number of OutPts, # of vertices in OutPts = i div 4; -+ -+ for (int j = 0; j < numInPts; ++j) -+ { -+ int s = j; -+ int p = (j + 1) % numInPts; -+ -+ int s_in = inside(&pInPts[s*4]); -+ int p_in = inside(&pInPts[p*4]); -+ -+ // test if vertex is to be added to output vertices -+ if (s_in != p_in) // edge crosses clipping plane -+ { -+ // find point of intersection -+ intersect(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); -+ i++; -+ } -+ if (p_in) // 2nd vertex is inside clipping volume, add it to output -+ { -+ // Copy 2nd vertex position of edge over to output. -+ for(int k = 0; k < 4; ++k) -+ { -+ pOutPts[i*4 + k] = pInPts[p*4 + k]; -+ } -+ // Copy 2nd vertex attributes of edge over to output. -+ for(int attr = 0; attr < numInAttribs; ++attr) -+ { -+ pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr]; -+ } -+ i++; -+ } -+ // edge does not cross clipping plane and vertex outside clipping volume -+ // => do not add vertex -+ } -+ return i; -+} -+ -+ -+ -+void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) -+{ -+ // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping -+ OSALIGN(float, 16) tempPts[6 * 4]; -+ OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; -+ -+ // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision -+ int NumOutPts = ClipTriToPlane(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); -+ NumOutPts = ClipTriToPlane(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); -+ NumOutPts = ClipTriToPlane(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); -+ NumOutPts = ClipTriToPlane(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); -+ NumOutPts = ClipTriToPlane(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); -+ NumOutPts = ClipTriToPlane(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); -+ -+ SWR_ASSERT(NumOutPts <= 6); -+ -+ *numVerts = NumOutPts; -+ return; -+} -+ -+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) -+{ -+ RDTSC_START(FEClipTriangles); -+ Clipper<3> clipper(workerId, pDC); -+ clipper.ExecuteStage(pa, prims, primMask, primId); -+ RDTSC_STOP(FEClipTriangles, 1, 0); -+} -+ -+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) -+{ -+ RDTSC_START(FEClipLines); -+ Clipper<2> clipper(workerId, pDC); -+ clipper.ExecuteStage(pa, prims, primMask, primId); -+ RDTSC_STOP(FEClipLines, 1, 0); -+} -+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) -+{ -+ RDTSC_START(FEClipPoints); -+ Clipper<1> clipper(workerId, pDC); -+ clipper.ExecuteStage(pa, prims, primMask, primId); -+ RDTSC_STOP(FEClipPoints, 1, 0); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h -new file mode 100644 -index 0000000..e9ba71d 
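// Per edge (s -> p) the scalar ClipTriToPlane() above implements the standard
// Sutherland-Hodgman cases:
//     s in,  p in   -> emit p
//     s in,  p out  -> emit the intersection point
//     s out, p in   -> emit the intersection point, then p
//     s out, p out  -> emit nothing
// A minimal caller of Clip() (buffer names are illustrative; sizes follow the
// "at most 6 output vertices" bound asserted above):
//
//     OSALIGN(float, 16) outPos[6 * 4];
//     OSALIGN(float, 16) outAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
//     int numVerts = 0;
//     Clip(triPos, triAttribs, numAttribs, outPos, &numVerts, outAttribs);
//     // numVerts <= 6; the clipped polygon is convex and is consumed as a fan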
---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h -@@ -0,0 +1,851 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file clip.h -+* -+* @brief Definitions for clipping -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/simdintrin.h" -+#include "core/context.h" -+#include "core/pa.h" -+#include "rdtsc_core.h" -+ -+enum SWR_CLIPCODES -+{ -+ // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. -+ // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. 
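// How these codes are consumed (a scalar view of what the SIMD Clipper below
// does): each vertex gets a mask of the planes it lies outside of, and the
// per-primitive decision uses the AND and the OR of the per-vertex masks
// (three for a triangle):
//
//     uint32_t isect = code[0] & code[1] & code[2];   // intersection
//     uint32_t uni   = code[0] | code[1] | code[2];   // union
//     if (isect != 0)                             { /* all verts outside one plane: trivially reject */ }
//     else if ((uni & GUARDBAND_CLIP_MASK) == 0)  { /* trivially accept, forward straight to the binner */ }
//     else                                        { /* run the guardband clipper */ }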
-+#define CLIPCODE_SHIFT 23 -+ FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), -+ FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), -+ FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), -+ FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), -+ -+ FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), -+ FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), -+ -+ NEGW = (0x40 << CLIPCODE_SHIFT), -+ -+ GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), -+ GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), -+ GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), -+ GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) -+}; -+ -+#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR) -+#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) -+ -+void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, -+ int *numVerts, float *pOutAttribs); -+ -+INLINE -+void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes) -+{ -+ clipCodes = _simd_setzero_ps(); -+ -+ // -w -+ simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); -+ -+ // FRUSTUM_LEFT -+ simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW); -+ clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); -+ -+ // FRUSTUM_TOP -+ vRes = _simd_cmplt_ps(vertex.y, vNegW); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP)))); -+ -+ // FRUSTUM_RIGHT -+ vRes = _simd_cmpgt_ps(vertex.x, vertex.w); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT)))); -+ -+ // FRUSTUM_BOTTOM -+ vRes = _simd_cmpgt_ps(vertex.y, vertex.w); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM)))); -+ -+ if (state.rastState.depthClipEnable) -+ { -+ // FRUSTUM_NEAR -+ // DX clips depth [0..w], GL clips [-w..w] -+ if (type == DX) -+ { -+ vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps()); -+ } -+ else -+ { -+ vRes = _simd_cmplt_ps(vertex.z, vNegW); -+ } -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR)))); -+ -+ // FRUSTUM_FAR -+ vRes = _simd_cmpgt_ps(vertex.z, vertex.w); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR)))); -+ } -+ -+ // NEGW -+ vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps()); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW)))); -+ -+ // GUARDBAND_LEFT -+ simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left)); -+ vRes = _simd_cmplt_ps(vertex.x, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT)))); -+ -+ // GUARDBAND_TOP -+ gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top)); -+ vRes = _simd_cmplt_ps(vertex.y, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP)))); -+ -+ // GUARDBAND_RIGHT -+ gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right)); -+ vRes = _simd_cmpgt_ps(vertex.x, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT)))); -+ -+ // GUARDBAND_BOTTOM -+ gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom)); -+ vRes = _simd_cmpgt_ps(vertex.y, gbMult); -+ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, 
_simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM)))); -+} -+ -+template -+class Clipper -+{ -+public: -+ Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : -+ workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC)) -+ { -+ static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); -+ } -+ -+ void ComputeClipCodes(simdvector vertex[]) -+ { -+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) -+ { -+ ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]); -+ } -+ } -+ -+ simdscalar ComputeClipCodeIntersection() -+ { -+ simdscalar result = this->clipCodes[0]; -+ for (uint32_t i = 1; i < NumVertsPerPrim; ++i) -+ { -+ result = _simd_and_ps(result, this->clipCodes[i]); -+ } -+ return result; -+ } -+ -+ simdscalar ComputeClipCodeUnion() -+ { -+ simdscalar result = this->clipCodes[0]; -+ for (uint32_t i = 1; i < NumVertsPerPrim; ++i) -+ { -+ result = _simd_or_ps(result, this->clipCodes[i]); -+ } -+ return result; -+ } -+ -+ int ComputeNegWMask() -+ { -+ simdscalar clipCodeUnion = ComputeClipCodeUnion(); -+ clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW))); -+ return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps())); -+ } -+ -+ int ComputeClipMask() -+ { -+ simdscalar clipUnion = ComputeClipCodeUnion(); -+ clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK))); -+ return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps())); -+ } -+ -+ // clipper is responsible for culling any prims with NAN coordinates -+ int ComputeNaNMask(simdvector prim[]) -+ { -+ simdscalar vNanMask = _simd_setzero_ps(); -+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e) -+ { -+ simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q); -+ vNanMask = _simd_or_ps(vNanMask, vNan01); -+ simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q); -+ vNanMask = _simd_or_ps(vNanMask, vNan23); -+ } -+ -+ return _simd_movemask_ps(vNanMask); -+ } -+ -+ int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[]) -+ { -+ uint8_t cullMask = this->state.rastState.cullDistanceMask; -+ simdscalar vClipCullMask = _simd_setzero_ps(); -+ DWORD index; -+ -+ simdvector vClipCullDistLo[3]; -+ simdvector vClipCullDistHi[3]; -+ -+ pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); -+ pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); -+ while (_BitScanForward(&index, cullMask)) -+ { -+ cullMask &= ~(1 << index); -+ uint32_t slot = index >> 2; -+ uint32_t component = index & 0x3; -+ -+ simdscalar vCullMaskElem = _simd_set1_ps(-1.0f); -+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e) -+ { -+ simdscalar vCullComp; -+ if (slot == 0) -+ { -+ vCullComp = vClipCullDistLo[e][component]; -+ } -+ else -+ { -+ vCullComp = vClipCullDistHi[e][component]; -+ } -+ -+ // cull if cull distance < 0 || NAN -+ simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ); -+ vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull); -+ } -+ vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem); -+ } -+ -+ // clipper should also discard any primitive with NAN clip distance -+ uint8_t clipMask = this->state.rastState.clipDistanceMask; -+ while (_BitScanForward(&index, clipMask)) -+ { -+ clipMask &= ~(1 << index); -+ uint32_t slot = index >> 2; -+ uint32_t component = index & 0x3; -+ -+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e) -+ { -+ simdscalar vClipComp; -+ if (slot == 0) -+ { -+ vClipComp = 
vClipCullDistLo[e][component]; -+ } -+ else -+ { -+ vClipComp = vClipCullDistHi[e][component]; -+ } -+ -+ simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); -+ vClipCullMask = _simd_or_ps(vClipCullMask, vClip); -+ } -+ } -+ -+ return _simd_movemask_ps(vClipCullMask); -+ } -+ -+ // clip a single primitive -+ int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) -+ { -+ OSALIGN(float, 16) inVerts[3 * 4]; -+ OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; -+ -+ // transpose primitive position -+ __m128 verts[3]; -+ pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts); -+ _mm_store_ps(&inVerts[0], verts[0]); -+ _mm_store_ps(&inVerts[4], verts[1]); -+ _mm_store_ps(&inVerts[8], verts[2]); -+ -+ // transpose attribs -+ uint32_t numScalarAttribs = this->state.linkageCount * 4; -+ -+ int idx = 0; -+ DWORD slot = 0; -+ uint32_t mapIdx = 0; -+ uint32_t tmpLinkage = uint32_t(this->state.linkageMask); -+ while (_BitScanForward(&slot, tmpLinkage)) -+ { -+ tmpLinkage &= ~(1 << slot); -+ // Compute absolute attrib slot in vertex array -+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; -+ __m128 attrib[3]; // triangle attribs (always 4 wide) -+ pa.AssembleSingle(inputSlot, primIndex, attrib); -+ _mm_store_ps(&inAttribs[idx], attrib[0]); -+ _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]); -+ _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]); -+ idx += 4; -+ } -+ -+ int numVerts; -+ Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs); -+ -+ return numVerts; -+ } -+ -+ // clip SIMD primitives -+ void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) -+ { -+ // input/output vertex store for clipper -+ simdvertex vertices[7]; // maximum 7 verts generated per triangle -+ -+ // assemble pos -+ simdvector tmpVector[NumVertsPerPrim]; -+ pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); -+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) -+ { -+ vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; -+ } -+ -+ // assemble attribs -+ DWORD slot = 0; -+ uint32_t mapIdx = 0; -+ uint32_t tmpLinkage = this->state.linkageMask; -+ while (_BitScanForward(&slot, tmpLinkage)) -+ { -+ tmpLinkage &= ~(1 << slot); -+ // Compute absolute attrib slot in vertex array -+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; -+ -+ pa.Assemble(inputSlot, tmpVector); -+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) -+ { -+ vertices[i].attrib[inputSlot] = tmpVector[i]; -+ } -+ } -+ -+ uint32_t numAttribs; -+ if (_BitScanReverse((DWORD*)&numAttribs, this->state.linkageMask)) -+ { -+ numAttribs++; -+ } -+ else -+ { -+ numAttribs = 0; -+ } -+ -+ simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); -+ -+ // set up new PA for binning clipped primitives -+ PFN_PROCESS_PRIMS pfnBinFunc = nullptr; -+ PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; -+ if (NumVertsPerPrim == 3) -+ { -+ pfnBinFunc = BinTriangles; -+ clipTopology = TOP_TRIANGLE_FAN; -+ -+ // so that the binner knows to bloat wide points later -+ if (pa.binTopology == TOP_POINT_LIST) -+ clipTopology = TOP_POINT_LIST; -+ } -+ else if (NumVertsPerPrim == 2) -+ { -+ pfnBinFunc = BinLines; -+ clipTopology = TOP_LINE_LIST; -+ } -+ else -+ { -+ SWR_ASSERT(0 && "Unexpected points in clipper."); -+ } -+ -+ -+ uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; -+ uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; -+ -+ 
const simdscalari vOffsets = _mm256_set_epi32( -+ 0 * sizeof(simdvertex), // unused lane -+ 6 * sizeof(simdvertex), -+ 5 * sizeof(simdvertex), -+ 4 * sizeof(simdvertex), -+ 3 * sizeof(simdvertex), -+ 2 * sizeof(simdvertex), -+ 1 * sizeof(simdvertex), -+ 0 * sizeof(simdvertex)); -+ -+ // only need to gather 7 verts -+ // @todo dynamic mask based on actual # of verts generated per lane -+ const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); -+ -+ uint32_t numClippedPrims = 0; -+ for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) -+ { -+ uint32_t numEmittedVerts = pVertexCount[inputPrim]; -+ if (numEmittedVerts < NumVertsPerPrim) -+ { -+ continue; -+ } -+ SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); -+ -+ uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); -+ numClippedPrims += numEmittedPrims; -+ -+ // tranpose clipper output so that each lane's vertices are in SIMD order -+ // set aside space for 2 vertices, as the PA will try to read up to 16 verts -+ // for triangle fan -+ simdvertex transposedPrims[2]; -+ -+ // transpose pos -+ uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); -+ pBase += sizeof(simdscalar); -+ } -+ -+ // transpose attribs -+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim; -+ for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); -+ pBase += sizeof(simdscalar); -+ } -+ } -+ -+ PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); -+ -+ while (clipPa.GetNextStreamOutput()) -+ { -+ do -+ { -+ simdvector attrib[NumVertsPerPrim]; -+ bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); -+ if (assemble) -+ { -+ static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; -+ pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); -+ } -+ } while (clipPa.NextPrim()); -+ } -+ } -+ -+ // update global pipeline stat -+ SWR_CONTEXT* pContext = this->pDC->pContext; -+ UPDATE_STAT(CPrimitives, numClippedPrims); -+ } -+ -+ // execute the clipper stage -+ void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId) -+ { -+ // set up binner based on PA state -+ PFN_PROCESS_PRIMS pfnBinner; -+ switch (pa.binTopology) -+ { -+ case TOP_POINT_LIST: -+ pfnBinner = CanUseSimplePoints(pDC) ? 
BinPoints : BinTriangles; -+ break; -+ case TOP_LINE_LIST: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LOOP: -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LISTSTRIP_ADJ: -+ pfnBinner = BinLines; -+ break; -+ default: -+ pfnBinner = BinTriangles; -+ break; -+ }; -+ -+ // update clipper invocations pipeline stat -+ SWR_CONTEXT* pContext = this->pDC->pContext; -+ uint32_t numInvoc = _mm_popcnt_u32(primMask); -+ UPDATE_STAT(CInvocations, numInvoc); -+ -+ ComputeClipCodes(prim); -+ -+ // cull prims with NAN coords -+ primMask &= ~ComputeNaNMask(prim); -+ -+ // user cull distance cull -+ if (this->state.rastState.cullDistanceMask) -+ { -+ primMask &= ~ComputeUserClipCullMask(pa, prim); -+ } -+ -+ // cull prims outside view frustum -+ simdscalar clipIntersection = ComputeClipCodeIntersection(); -+ int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); -+ -+ // skip clipping for points -+ uint32_t clipMask = 0; -+ if (NumVertsPerPrim != 1) -+ { -+ clipMask = primMask & ComputeClipMask(); -+ } -+ -+ if (clipMask) -+ { -+ RDTSC_START(FEGuardbandClip); -+ // we have to clip tris, execute the clipper, which will also -+ // call the binner -+ ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); -+ RDTSC_STOP(FEGuardbandClip, 1, 0); -+ } -+ else if (validMask) -+ { -+ // update CPrimitives pipeline state -+ SWR_CONTEXT* pContext = this->pDC->pContext; -+ UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask)); -+ -+ // forward valid prims directly to binner -+ pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); -+ } -+ } -+ -+private: -+ inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) -+ { -+ return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); -+ } -+ -+ inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) -+ { -+ const uint32_t simdVertexStride = sizeof(simdvertex); -+ const uint32_t componentStride = sizeof(simdscalar); -+ const uint32_t attribStride = sizeof(simdvector); -+ const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), -+ 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); -+ -+ // step to the simdvertex -+ simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); -+ -+ // step to the attribute and component -+ vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component)); -+ -+ // step to the lane -+ vOffsets = _simd_add_epi32(vOffsets, vElemOffset); -+ -+ return vOffsets; -+ } -+ -+ // gathers a single component for a given attribute for each SIMD lane -+ inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) -+ { -+ simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); -+ simdscalar vSrc = _mm256_undefined_ps(); -+ return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); -+ } -+ -+ inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) -+ { -+ simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); -+ -+ uint32_t* pOffsets = (uint32_t*)&vOffsets; -+ float* pSrc = (float*)&vSrc; -+ uint32_t mask = _simd_movemask_ps(vMask); -+ DWORD lane; -+ while (_BitScanForward(&lane, mask)) -+ { -+ mask &= ~(1 << lane); -+ uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; -+ 
*(float*)pBuf = pSrc[lane]; -+ } -+ } -+ -+ template -+ inline void intersect( -+ const simdscalar& vActiveMask, // active lanes to operate on -+ const simdscalari& s, // index to first edge vertex v0 in pInPts. -+ const simdscalari& p, // index to second edge vertex v1 in pInPts. -+ const simdvector& v1, // vertex 0 position -+ const simdvector& v2, // vertex 1 position -+ simdscalari& outIndex, // output index. -+ const float *pInVerts, // array of all the input positions. -+ uint32_t numInAttribs, // number of attributes per vertex. -+ float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. -+ { -+ // compute interpolation factor -+ simdscalar t; -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break; -+ case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break; -+ case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break; -+ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break; -+ case FRUSTUM_NEAR: -+ // DX Znear plane is 0, GL is -w -+ if (this->driverType == DX) -+ { -+ t = ComputeInterpFactor(v1[2], v2[2]); -+ } -+ else -+ { -+ t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2])); -+ } -+ break; -+ case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break; -+ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ }; -+ -+ // interpolate position and store -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]); -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); -+ } -+ -+ // interpolate attributes and store -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); -+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); -+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); -+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); -+ } -+ } -+ } -+ -+ template -+ inline simdscalar inside(const simdvector& v) -+ { -+ switch (ClippingPlane) -+ { -+ case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); -+ case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]); -+ case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); -+ case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]); -+ case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? 
_simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); -+ case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]); -+ default: -+ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); -+ return _simd_setzero_ps(); -+ } -+ } -+ -+ template -+ simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) -+ { -+ simdscalari vCurIndex = _simd_setzero_si(); -+ simdscalari vOutIndex = _simd_setzero_si(); -+ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); -+ -+ while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty -+ { -+ simdscalari s = vCurIndex; -+ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); -+ simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p); -+ p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask))); -+ -+ // gather position -+ simdvector vInPos0, vInPos1; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); -+ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); -+ } -+ -+ // compute inside mask -+ simdscalar s_in = inside(vInPos0); -+ simdscalar p_in = inside(vInPos1); -+ -+ // compute intersection mask (s_in != p_in) -+ simdscalar intersectMask = _simd_xor_ps(s_in, p_in); -+ intersectMask = _simd_and_ps(intersectMask, vActiveMask); -+ -+ // store s if inside -+ s_in = _simd_and_ps(s_in, vActiveMask); -+ if (!_simd_testz_ps(s_in, s_in)) -+ { -+ // store position -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); -+ } -+ -+ // store attribs -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); -+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); -+ } -+ } -+ -+ // increment outIndex -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); -+ } -+ -+ // compute and store intersection -+ if (!_simd_testz_ps(intersectMask, intersectMask)) -+ { -+ intersect(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); -+ -+ // increment outIndex for active lanes -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); -+ } -+ -+ // increment loop index and update active mask -+ vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1)); -+ vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); -+ } -+ -+ return vOutIndex; -+ } -+ -+ template -+ simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) -+ { -+ simdscalari vCurIndex = _simd_setzero_si(); -+ simdscalari vOutIndex = _simd_setzero_si(); -+ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); -+ -+ if (!_simd_testz_ps(vActiveMask, vActiveMask)) -+ { -+ simdscalari s = vCurIndex; -+ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); -+ -+ // gather position -+ simdvector vInPos0, vInPos1; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); -+ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); -+ } -+ -+ // compute inside mask -+ 
simdscalar s_in = inside(vInPos0); -+ simdscalar p_in = inside(vInPos1); -+ -+ // compute intersection mask (s_in != p_in) -+ simdscalar intersectMask = _simd_xor_ps(s_in, p_in); -+ intersectMask = _simd_and_ps(intersectMask, vActiveMask); -+ -+ // store s if inside -+ s_in = _simd_and_ps(s_in, vActiveMask); -+ if (!_simd_testz_ps(s_in, s_in)) -+ { -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); -+ } -+ -+ // interpolate attributes and store -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); -+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); -+ } -+ } -+ -+ // increment outIndex -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); -+ } -+ -+ // compute and store intersection -+ if (!_simd_testz_ps(intersectMask, intersectMask)) -+ { -+ intersect(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); -+ -+ // increment outIndex for active lanes -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); -+ } -+ -+ // store p if inside -+ p_in = _simd_and_ps(p_in, vActiveMask); -+ if (!_simd_testz_ps(p_in, p_in)) -+ { -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); -+ } -+ -+ // interpolate attributes and store -+ for (uint32_t a = 0; a < numInAttribs; ++a) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); -+ ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); -+ } -+ } -+ -+ // increment outIndex -+ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in); -+ } -+ } -+ -+ return vOutIndex; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Vertical clipper. Clips SIMD primitives at a time -+ /// @param pVertices - pointer to vertices in SOA form. 
Clipper will read input and write results to this buffer -+ /// @param vPrimMask - mask of valid input primitives, including non-clipped prims -+ /// @param numAttribs - number of valid input attribs, including position -+ simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) -+ { -+ // temp storage -+ simdvertex tempVertices[7]; -+ float* pTempVerts = (float*)&tempVertices[0]; -+ -+ // zero out num input verts for non-active lanes -+ simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); -+ vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask); -+ -+ // clip prims to frustum -+ simdscalari vNumOutPts; -+ if (NumVertsPerPrim == 3) -+ { -+ vNumOutPts = ClipTriToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipTriToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipTriToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ } -+ else -+ { -+ SWR_ASSERT(NumVertsPerPrim == 2); -+ vNumOutPts = ClipLineToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipLineToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ vNumOutPts = ClipLineToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); -+ vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); -+ } -+ -+ // restore num verts for non-clipped, active lanes -+ simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask); -+ vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask); -+ -+ return vNumOutPts; -+ } -+ -+ const uint32_t workerId; -+ const DRIVER_TYPE driverType; -+ DRAW_CONTEXT* pDC; -+ const API_STATE& state; -+ simdscalar clipCodes[NumVertsPerPrim]; -+}; -+ -+ -+// pipeline stage functions -+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h -new file mode 100644 -index 0000000..c719f27 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/context.h -@@ -0,0 +1,444 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
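// Note on ClipPrims() above: the six fixed clip stages ping-pong between the
// caller's vertex buffer and the temporary buffer, and because the number of
// passes is even the final vertices land back in pVertices.  An equivalent
// illustrative formulation of the same ping-pong, with the per-plane template
// arguments elided exactly as in the calls above:
//
//     simdscalari vNumOutPts = vNumInPts;
//     float* bufIn  = pVertices;
//     float* bufOut = pTempVerts;
//     for (/* each of the six frustum planes, in the order used above */)
//     {
//         vNumOutPts = ClipTriToPlane<...>(bufIn, vNumOutPts, numAttribs, bufOut);
//         std::swap(bufIn, bufOut);
//     }
//     // after six swaps bufIn == pVertices again, holding the clipped result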
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file context.h -+* -+* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT -+* The SWR_CONTEXT is our global context and contains the DC ring, -+* thread state, etc. -+* -+* The DRAW_CONTEXT contains all state associated with a draw operation. -+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+ -+#include "core/api.h" -+#include "core/utils.h" -+#include "core/arena.h" -+#include "core/fifo.hpp" -+#include "core/knobs.h" -+#include "common/simdintrin.h" -+#include "core/threads.h" -+ -+// x.8 fixed point precision values -+#define FIXED_POINT_SHIFT 8 -+#define FIXED_POINT_SCALE 256 -+ -+// x.16 fixed point precision values -+#define FIXED_POINT16_SHIFT 16 -+#define FIXED_POINT16_SCALE 65536 -+ -+struct SWR_CONTEXT; -+struct DRAW_CONTEXT; -+ -+struct TRI_FLAGS -+{ -+ uint32_t frontFacing : 1; -+ uint32_t yMajor : 1; -+ uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); -+ uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); -+ uint32_t primID; -+ uint32_t renderTargetArrayIndex; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TRIANGLE_DESC -+///////////////////////////////////////////////////////////////////////// -+struct SWR_TRIANGLE_DESC -+{ -+ float I[3]; -+ float J[3]; -+ float Z[3]; -+ float OneOverW[3]; -+ float recipDet; -+ -+ float *pAttribs; -+ float *pPerspAttribs; -+ float *pSamplePos; -+ float *pUserClipBuffer; -+ -+ uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; -+ -+ TRI_FLAGS triFlags; -+}; -+ -+struct TRIANGLE_WORK_DESC -+{ -+ float *pTriBuffer; -+ float *pAttribs; -+ float *pUserClipBuffer; -+ uint32_t numAttribs; -+ TRI_FLAGS triFlags; -+}; -+ -+union CLEAR_FLAGS -+{ -+ struct -+ { -+ uint32_t mask : 3; -+ }; -+ uint32_t bits; -+}; -+ -+struct CLEAR_DESC -+{ -+ CLEAR_FLAGS flags; -+ float clearRTColor[4]; // RGBA_32F -+ float clearDepth; // [0..1] -+ BYTE clearStencil; -+}; -+ -+struct INVALIDATE_TILES_DESC -+{ -+ uint32_t attachmentMask; -+}; -+ -+struct SYNC_DESC -+{ -+ PFN_CALLBACK_FUNC pfnCallbackFunc; -+ uint64_t userData; -+ uint64_t userData2; -+}; -+ -+struct QUERY_DESC -+{ -+ SWR_STATS* pStats; -+}; -+ -+struct STORE_TILES_DESC -+{ -+ SWR_RENDERTARGET_ATTACHMENT attachment; -+ SWR_TILE_STATE postStoreTileState; -+}; -+ -+struct COMPUTE_DESC -+{ -+ uint32_t threadGroupCountX; -+ uint32_t 
threadGroupCountY; -+ uint32_t threadGroupCountZ; -+}; -+ -+typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); -+ -+enum WORK_TYPE -+{ -+ SYNC, -+ DRAW, -+ CLEAR, -+ INVALIDATETILES, -+ STORETILES, -+ QUERYSTATS, -+}; -+ -+struct BE_WORK -+{ -+ WORK_TYPE type; -+ PFN_WORK_FUNC pfnWork; -+ union -+ { -+ SYNC_DESC sync; -+ TRIANGLE_WORK_DESC tri; -+ CLEAR_DESC clear; -+ INVALIDATE_TILES_DESC invalidateTiles; -+ STORE_TILES_DESC storeTiles; -+ QUERY_DESC queryStats; -+ } desc; -+}; -+ -+struct DRAW_WORK -+{ -+ DRAW_CONTEXT* pDC; -+ union -+ { -+ uint32_t numIndices; // DrawIndexed: Number of indices for draw. -+ uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) -+ }; -+ union -+ { -+ const int32_t* pIB; // DrawIndexed: App supplied indices -+ uint32_t startVertex; // Draw: Starting vertex in VB to render from. -+ }; -+ int32_t baseVertex; -+ uint32_t numInstances; // Number of instances -+ uint32_t startInstance; // Instance offset -+ uint32_t startPrimID; // starting primitiveID for this draw batch -+ SWR_FORMAT type; // index buffer type -+}; -+ -+typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); -+struct FE_WORK -+{ -+ WORK_TYPE type; -+ PFN_FE_WORK_FUNC pfnWork; -+ union -+ { -+ SYNC_DESC sync; -+ DRAW_WORK draw; -+ CLEAR_DESC clear; -+ INVALIDATE_TILES_DESC invalidateTiles; -+ STORE_TILES_DESC storeTiles; -+ QUERY_DESC queryStats; -+ } desc; -+}; -+ -+struct GUARDBAND -+{ -+ float left, right, top, bottom; -+}; -+ -+struct PA_STATE; -+ -+// function signature for pipeline stages that execute after primitive assembly -+typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], -+ uint32_t primMask, simdscalari primID); -+ -+OSALIGNLINE(struct) API_STATE -+{ -+ // Vertex Buffers -+ SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; -+ -+ // Index Buffer -+ SWR_INDEX_BUFFER_STATE indexBuffer; -+ -+ // FS - Fetch Shader State -+ PFN_FETCH_FUNC pfnFetchFunc; -+ -+ // VS - Vertex Shader State -+ PFN_VERTEX_FUNC pfnVertexFunc; -+ -+ // GS - Geometry Shader State -+ PFN_GS_FUNC pfnGsFunc; -+ SWR_GS_STATE gsState; -+ -+ // CS - Compute Shader -+ PFN_CS_FUNC pfnCsFunc; -+ uint32_t totalThreadsInGroup; -+ -+ // FE - Frontend State -+ SWR_FRONTEND_STATE frontendState; -+ -+ // SOS - Streamout Shader State -+ PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; -+ -+ // Streamout state -+ SWR_STREAMOUT_STATE soState; -+ mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; -+ -+ // Tessellation State -+ PFN_HS_FUNC pfnHsFunc; -+ PFN_DS_FUNC pfnDsFunc; -+ SWR_TS_STATE tsState; -+ -+ // Specifies which VS outputs are sent to PS. 
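// A minimal sketch (not from the patch) of how a backend work item built from
// the BE_WORK/CLEAR_DESC structures above is filled in and later executed by a
// worker; EnqueueForMacroTile() is a hypothetical stand-in for the tile manager:
//
//     BE_WORK work;
//     work.type = CLEAR;
//     work.pfnWork = ProcessClearBE;              // declared in backend.h
//     work.desc.clear.flags.mask = 0x1;           // 3-bit attachment mask
//     work.desc.clear.clearRTColor[0] = 0.0f;     // RGBA_32F clear colour
//     EnqueueForMacroTile(macroTile, work);       // hypothetical
//
//     // worker side, for each queued item:
//     work.pfnWork(pDC, workerId, macroTile, &work.desc);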
-+ // Does not include position -+ uint32_t linkageMask; -+ uint32_t linkageCount; -+ uint8_t linkageMap[MAX_ATTRIBUTES]; -+ -+ // attrib mask, specifies the total set of attributes used -+ // by the frontend (vs, so, gs) -+ uint32_t feAttribMask; -+ -+ PRIMITIVE_TOPOLOGY topology; -+ bool forceFront; -+ -+ // RS - Rasterizer State -+ SWR_RASTSTATE rastState; -+ // floating point multisample offsets -+ float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; -+ -+ GUARDBAND gbState; -+ -+ SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; -+ SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS]; -+ -+ BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; -+ BBOX scissorInFixedPoint; -+ -+ // Backend state -+ SWR_BACKEND_STATE backendState; -+ -+ // PS - Pixel shader state -+ SWR_PS_STATE psState; -+ -+ SWR_DEPTH_STENCIL_STATE depthStencilState; -+ -+ // OM - Output Merger State -+ SWR_BLEND_STATE blendState; -+ PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; -+ -+ // Stats are incremented when this is true. -+ bool enableStats; -+}; -+ -+class MacroTileMgr; -+class DispatchQueue; -+ -+struct RenderOutputBuffers -+{ -+ uint8_t* pColor[SWR_NUM_RENDERTARGETS]; -+ uint8_t* pDepth; -+ uint8_t* pStencil; -+}; -+ -+// pipeline function pointer types -+typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); -+ -+// Draw State -+struct DRAW_STATE -+{ -+ API_STATE state; -+ -+ void* pPrivateState; // Its required the driver sets this up for each draw. -+ -+ // pipeline function pointers, filled in by API thread when setting up the draw -+ PFN_BACKEND_FUNC pfnBackend; -+ PFN_PROCESS_PRIMS pfnProcessPrims; -+ -+ Arena arena; // This should only be used by API thread. -+}; -+ -+// Draw Context -+// The api thread sets up a draw context that exists for the life of the draw. -+// This draw context maintains all of the state needed for the draw operation. -+struct DRAW_CONTEXT -+{ -+ SWR_CONTEXT *pContext; -+ -+ uint64_t drawId; -+ -+ bool isCompute; // Is this DC a compute context? -+ -+ FE_WORK FeWork; -+ volatile OSALIGNLINE(uint32_t) FeLock; -+ volatile OSALIGNLINE(bool) inUse; -+ volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? -+ -+ uint64_t dependency; -+ -+ MacroTileMgr* pTileMgr; -+ -+ // The following fields are valid if isCompute is true. -+ volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute) -+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) -+ -+ DRAW_STATE* pState; -+ Arena arena; -+}; -+ -+INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) -+{ -+ SWR_ASSERT(pDC != nullptr); -+ SWR_ASSERT(pDC->pState != nullptr); -+ -+ return pDC->pState->state; -+} -+ -+INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) -+{ -+ SWR_ASSERT(pDC != nullptr); -+ SWR_ASSERT(pDC->pState != nullptr); -+ -+ return pDC->pState->pPrivateState; -+} -+ -+class HotTileMgr; -+ -+struct SWR_CONTEXT -+{ -+ // Draw Context Ring -+ // Each draw needs its own state in order to support mulitple draws in flight across multiple threads. -+ // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number -+ // of draws that can be in flight at any given time. -+ // -+ // Description: -+ // 1. State - When an application first sets state we'll request a new draw context to use. -+ // a. If there are no available draw contexts then we'll have to wait until one becomes free. -+ // b. If one is available then set pCurDrawContext to point to it and mark it in use. 
-+ // c. All state calls set state on pCurDrawContext. -+ // 2. Draw - Creates submits a work item that is associated with current draw context. -+ // a. Set pPrevDrawContext = pCurDrawContext -+ // b. Set pCurDrawContext to NULL. -+ // 3. State - When an applications sets state after draw -+ // a. Same as step 1. -+ // b. State is copied from prev draw context to current. -+ DRAW_CONTEXT* dcRing; -+ -+ DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. -+ DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. -+ -+ // Draw State Ring -+ // When draw are very large (lots of primitives) then the API thread will break these up. -+ // These split draws all have identical state. So instead of storing the state directly -+ // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs -+ // to reference a single entry in the DS ring. -+ DRAW_STATE* dsRing; -+ -+ uint32_t curStateId; // Current index to the next available entry in the DS ring. -+ -+ uint32_t NumWorkerThreads; -+ -+ THREAD_POOL threadPool; // Thread pool associated with this context -+ -+ std::condition_variable FifosNotEmpty; -+ std::mutex WaitLock; -+ -+ // Draw Contexts will get a unique drawId generated from this -+ uint64_t nextDrawId; -+ -+ // Last retired drawId. Read/written only be API thread -+ uint64_t LastRetiredId; -+ -+ // most recent draw id enqueued by the API thread -+ // written by api thread, read by multiple workers -+ OSALIGNLINE(volatile uint64_t) DrawEnqueued; -+ -+ // Current FE status of each worker. -+ OSALIGNLINE(volatile uint64_t) WorkerFE[KNOB_MAX_NUM_THREADS]; -+ OSALIGNLINE(volatile uint64_t) WorkerBE[KNOB_MAX_NUM_THREADS]; -+ -+ DRIVER_TYPE driverType; -+ -+ uint32_t privateStateSize; -+ -+ HotTileMgr *pHotTileMgr; -+ -+ // tile load/store functions, passed in at create context time -+ PFN_LOAD_TILE pfnLoadTile; -+ PFN_STORE_TILE pfnStoreTile; -+ PFN_CLEAR_TILE pfnClearTile; -+ -+ // Global Stats -+ SWR_STATS stats[KNOB_MAX_NUM_THREADS]; -+ -+ // Scratch space for workers. -+ uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; -+}; -+ -+void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); -+void WakeAllThreads(SWR_CONTEXT *pContext); -+ -+#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += count; } -+#define SET_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = count; } -diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h -new file mode 100644 -index 0000000..9f869ec ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h -@@ -0,0 +1,215 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
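The draw-context ring described in SWR_CONTEXT above can be summarised with a minimal sketch (illustrative only, not part of the patch): RING_SIZE and RingSlotForDraw are hypothetical stand-ins for the real draws-in-flight knob and context-allocation logic, which additionally waits for the chosen slot to retire before reusing it.

#include <cstdint>

// Hypothetical sketch: a monotonically increasing drawId maps onto the
// fixed-size DC ring, so at most RING_SIZE draws can be unretired at once.
static const uint32_t RING_SIZE = 64;   // assumed draws-in-flight limit

uint32_t RingSlotForDraw(uint64_t drawId)
{
    return static_cast<uint32_t>(drawId % RING_SIZE);
}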
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file depthstencil.h -+* -+* @brief Implements depth/stencil functionality -+* -+******************************************************************************/ -+#pragma once -+#include "common/os.h" -+#include "format_conversion.h" -+ -+INLINE -+void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps) -+{ -+ simdscalari stencil = _simd_castps_si(stencilps); -+ -+ switch (op) -+ { -+ case STENCILOP_KEEP: -+ break; -+ case STENCILOP_ZERO: -+ stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask); -+ break; -+ case STENCILOP_REPLACE: -+ stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask); -+ break; -+ case STENCILOP_INCRSAT: -+ { -+ simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); -+ break; -+ } -+ case STENCILOP_DECRSAT: -+ { -+ simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); -+ break; -+ } -+ case STENCILOP_INCR: -+ { -+ simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); -+ break; -+ } -+ case STENCILOP_DECR: -+ { -+ simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); -+ stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); -+ break; -+ } -+ case STENCILOP_INVERT: -+ { -+ simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); -+ stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask); -+ break; -+ } -+ default: -+ break; -+ } -+} -+ -+ -+INLINE -+simdscalar ZTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, -+ bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar mask, BYTE *pStencilBase, -+ bool testOnly) -+{ -+ static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); -+ static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); -+ -+ simdscalar depthResult = _simd_set1_ps(-1.0f); -+ simdscalar zbuf; -+ -+ // clamp Z to viewport [minZ..maxZ] -+ simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); -+ simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); -+ interpZ = _simd_min_ps(vMaxZ, 
_simd_max_ps(vMinZ, interpZ)); -+ -+ if (pDSState->depthTestEnable) -+ { -+ switch (pDSState->depthTestFunc) -+ { -+ case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break; -+ case ZFUNC_ALWAYS: break; -+ default: -+ zbuf = _simd_load_ps((const float*)pDepthBase); -+ } -+ -+ switch (pDSState->depthTestFunc) -+ { -+ case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break; -+ case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break; -+ case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break; -+ case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break; -+ case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break; -+ } -+ } -+ -+ simdscalar stencilMask = _simd_set1_ps(-1.0f); -+ simdscalar stencilbuf; -+ -+ uint8_t stencilRefValue; -+ uint32_t stencilTestFunc; -+ uint32_t stencilFailOp; -+ uint32_t stencilPassDepthPassOp; -+ uint32_t stencilPassDepthFailOp; -+ uint8_t stencilTestMask; -+ uint8_t stencilWriteMask; -+ if (frontFacing || !pDSState->doubleSidedStencilTestEnable) -+ { -+ stencilRefValue = pDSState->stencilRefValue; -+ stencilTestFunc = pDSState->stencilTestFunc; -+ stencilFailOp = pDSState->stencilFailOp; -+ stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp; -+ stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp; -+ stencilTestMask = pDSState->stencilTestMask; -+ stencilWriteMask = pDSState->stencilWriteMask; -+ } -+ else -+ { -+ stencilRefValue = pDSState->backfaceStencilRefValue; -+ stencilTestFunc = pDSState->backfaceStencilTestFunc; -+ stencilFailOp = pDSState->backfaceStencilFailOp; -+ stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp; -+ stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp; -+ stencilTestMask = pDSState->backfaceStencilTestMask; -+ stencilWriteMask = pDSState->backfaceStencilWriteMask; -+ } -+ -+ if (pDSState->stencilTestEnable) -+ { -+ simdvector sbuf; -+ LoadSOA(pStencilBase, sbuf); -+ stencilbuf = sbuf.v[0]; -+ -+ // apply stencil read mask -+ simdscalar stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); -+ -+ // do stencil compare in float to avoid simd integer emulation in AVX1 -+ stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask)); -+ -+ simdscalar stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask)); -+ -+ switch (stencilTestFunc) -+ { -+ case ZFUNC_ALWAYS: break; -+ case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break; -+ case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break; -+ case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break; -+ } -+ } -+ -+ simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask); -+ depthWriteMask = _simd_and_ps(depthWriteMask, mask); -+ -+ if (testOnly) { -+ return depthWriteMask; -+ } -+ -+ if (pDSState->depthWriteEnable) -+ { -+ _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(depthWriteMask), interpZ); -+ } -+ -+ if (pDSState->stencilWriteEnable) -+ { -+ simdscalar stencilps = stencilbuf; -+ simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); -+ -+ simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, 
mask); -+ simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthResult); -+ simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthResult, _simd_set1_ps(-1))); -+ -+ simdscalar origStencil = stencilps; -+ -+ StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); -+ StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps); -+ StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps); -+ -+ // apply stencil write mask -+ simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask); -+ stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); -+ stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); -+ -+ simdvector stencilResult; -+ stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, mask); -+ StoreSOA(stencilResult, pStencilBase); -+ } -+ -+ return depthWriteMask; -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp -new file mode 100644 -index 0000000..238f5ee ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp -@@ -0,0 +1,144 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file fifo.hpp -+* -+* @brief Definitions for our fifos used for thread communication. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/os.h" -+#include -+#include -+ -+template -+struct QUEUE -+{ -+ OSALIGNLINE(volatile uint32_t) mLock; -+ OSALIGNLINE(volatile uint32_t) mNumEntries; -+ std::vector mBlocks; -+ T* mCurBlock; -+ uint32_t mHead; -+ uint32_t mTail; uint32_t mCurBlockIdx; -+ -+ // power of 2 -+ static const uint32_t mBlockSizeShift = 6; -+ static const uint32_t mBlockSize = 1 << mBlockSizeShift; -+ -+ void initialize() -+ { -+ mLock = 0; -+ mHead = 0; -+ mTail = 0; -+ mNumEntries = 0; -+ mCurBlock = (T*)malloc(mBlockSize*sizeof(T)); -+ mBlocks.push_back(mCurBlock); -+ mCurBlockIdx = 0; -+ } -+ -+ void clear() -+ { -+ mHead = 0; -+ mTail = 0; -+ mCurBlock = mBlocks[0]; -+ mCurBlockIdx = 0; -+ -+ mNumEntries = 0; -+ _ReadWriteBarrier(); -+ mLock = 0; -+ } -+ -+ uint32_t getNumQueued() -+ { -+ return mNumEntries; -+ } -+ -+ bool tryLock() -+ { -+ if (mLock) -+ { -+ return false; -+ } -+ -+ // try to lock the FIFO -+ LONG initial = InterlockedCompareExchange(&mLock, 1, 0); -+ return (initial == 0); -+ } -+ -+ void unlock() -+ { -+ mLock = 0; -+ } -+ -+ T* peek() -+ { -+ if (mNumEntries == 0) -+ { -+ return nullptr; -+ } -+ uint32_t block = mHead >> mBlockSizeShift; -+ return &mBlocks[block][mHead & (mBlockSize-1)]; -+ } -+ -+ void dequeue_noinc() -+ { -+ mHead ++; -+ mNumEntries --; -+ } -+ -+ bool enqueue_try_nosync(const T* entry) -+ { -+ memcpy(&mCurBlock[mTail], entry, sizeof(T)); -+ -+ mTail ++; -+ if (mTail == mBlockSize) -+ { -+ if (++mCurBlockIdx < mBlocks.size()) -+ { -+ mCurBlock = mBlocks[mCurBlockIdx]; -+ } -+ else -+ { -+ T* newBlock = (T*)malloc(sizeof(T)*mBlockSize); -+ SWR_ASSERT(newBlock); -+ -+ mBlocks.push_back(newBlock); -+ mCurBlock = newBlock; -+ } -+ -+ mTail = 0; -+ } -+ -+ mNumEntries ++; -+ return true; -+ } -+ -+ void destroy() -+ { -+ for (uint32_t i = 0; i < mBlocks.size(); ++i) -+ { -+ free(mBlocks[i]); -+ } -+ } -+ -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h -new file mode 100644 -index 0000000..af57697 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h -@@ -0,0 +1,167 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
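As a usage sketch for the QUEUE fifo defined above (illustrative only, not part of the patch): WORK_ITEM and FifoExample are hypothetical names standing in for the BE_WORK entries and the producer/consumer code in the threading layer, and QUEUE is assumed to be templated on its element type.

#include <cstdint>

struct WORK_ITEM { uint32_t id; };

void FifoExample()
{
    QUEUE<WORK_ITEM> fifo;
    fifo.initialize();                   // allocates the first block

    WORK_ITEM w{ 42 };
    fifo.enqueue_try_nosync(&w);         // producer: copy the entry into the current block

    if (fifo.tryLock())                  // consumer: spin-lock guards peek/dequeue
    {
        if (WORK_ITEM* pWork = fifo.peek())
        {
            // ... consume *pWork ...
            fifo.dequeue_noinc();        // advance head past the consumed entry
        }
        fifo.unlock();
    }

    fifo.destroy();                      // frees all blocks
}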
-+* -+* @file format_conversion.h -+* -+* @brief API implementation -+* -+******************************************************************************/ -+#include "format_types.h" -+#include "format_traits.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Load SIMD packed pixels in SOA format and converts to -+/// SOA RGBA32_FLOAT format. -+/// @param pSrc - source data in SOA form -+/// @param dst - output data in SOA form -+template -+INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst) -+{ -+ // fast path for float32 -+ if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits::GetBPC(0) == 32)) -+ { -+ auto lambda = [&](int comp) -+ { -+ simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar))); -+ -+ dst.v[FormatTraits::swizzle(comp)] = vComp; -+ }; -+ -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+ return; -+ } -+ -+ auto lambda = [&](int comp) -+ { -+ // load SIMD components -+ simdscalar vComp = FormatTraits::loadSOA(comp, pSrc); -+ -+ // unpack -+ vComp = FormatTraits::unpack(comp, vComp); -+ -+ // convert -+ if (FormatTraits::isNormalized(comp)) -+ { -+ vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp)); -+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::toFloat(comp))); -+ } -+ -+ dst.v[FormatTraits::swizzle(comp)] = vComp; -+ -+ pSrc += (FormatTraits::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; -+ }; -+ -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert and store simdvector of pixels in SOA -+/// RGBA32_FLOAT to SOA format -+/// @param src - source data in SOA form -+/// @param dst - output data in SOA form -+template -+INLINE void StoreSOA(const simdvector &src, BYTE *pDst) -+{ -+ // fast path for float32 -+ if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits::GetBPC(0) == 32)) -+ { -+ for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ simdscalar vComp = src.v[FormatTraits::swizzle(comp)]; -+ -+ // Gamma-correct -+ if (FormatTraits::isSRGB) -+ { -+ if (comp < 3) // Input format is always RGBA32_FLOAT. -+ { -+ vComp = FormatTraits::convertSrgb(comp, vComp); -+ } -+ } -+ -+ _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp); -+ } -+ return; -+ } -+ -+ auto lambda = [&](int comp) -+ { -+ simdscalar vComp = src.v[FormatTraits::swizzle(comp)]; -+ -+ // Gamma-correct -+ if (FormatTraits::isSRGB) -+ { -+ if (comp < 3) // Input format is always RGBA32_FLOAT. 
-+ { -+ vComp = FormatTraits::convertSrgb(comp, vComp); -+ } -+ } -+ -+ // convert -+ if (FormatTraits::isNormalized(comp)) -+ { -+ if (FormatTraits::GetType(comp) == SWR_TYPE_UNORM) -+ { -+ vComp = _simd_max_ps(vComp, _simd_setzero_ps()); -+ } -+ -+ if (FormatTraits::GetType(comp) == SWR_TYPE_SNORM) -+ { -+ vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f)); -+ } -+ vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f)); -+ -+ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(comp))); -+ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); -+ } -+ else if (FormatTraits::GetBPC(comp) < 32) -+ { -+ if (FormatTraits::GetType(comp) == SWR_TYPE_UINT) -+ { -+ int iMax = (1 << FormatTraits::GetBPC(comp)) - 1; -+ int iMin = 0; -+ simdscalari vCompi = _simd_castps_si(vComp); -+ vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin)); -+ vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax)); -+ vComp = _simd_castsi_ps(vCompi); -+ } -+ else if (FormatTraits::GetType(comp) == SWR_TYPE_SINT) -+ { -+ int iMax = (1 << (FormatTraits::GetBPC(comp) - 1)) - 1; -+ int iMin = -1 - iMax; -+ simdscalari vCompi = _simd_castps_si(vComp); -+ vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin)); -+ vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax)); -+ vComp = _simd_castsi_ps(vCompi); -+ } -+ } -+ -+ // pack -+ vComp = FormatTraits::pack(comp, vComp); -+ -+ // store -+ FormatTraits::storeSOA(comp, pDst, vComp); -+ -+ pDst += (FormatTraits::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; -+ }; -+ -+ UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h -new file mode 100644 -index 0000000..d39f523 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h -@@ -0,0 +1,2954 @@ -+ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
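The normalized-component handling in LoadSOA/StoreSOA above reduces, per SIMD lane, to a clamp, a scale by the format's range, and an integer conversion. A scalar sketch for an 8-bit UNORM component follows (illustrative only, not part of the patch); 255.0f stands in for the format's fromFloat()/toFloat() factors, and the pack/unpack of the packed layout is omitted.

#include <algorithm>
#include <cstdint>

// Hypothetical scalar mirror of the UNORM store path: clamp to [0,1], scale to
// the integer range, convert (the SIMD code uses _simd_cvtps_epi32, round-to-nearest).
uint8_t StoreUnorm8(float v)
{
    v = std::min(std::max(v, 0.0f), 1.0f);
    return static_cast<uint8_t>(v * 255.0f + 0.5f);
}

// Reverse direction, as in LoadSOA: integer -> float, then scale back to [0,1].
float LoadUnorm8(uint8_t v)
{
    return v * (1.0f / 255.0f);
}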
-+* -+* @file format_traits.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+#include "format_types.h" -+#include "utils.h" -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatSwizzle - Component swizzle selects -+////////////////////////////////////////////////////////////////////////// -+template -+struct FormatSwizzle -+{ -+ // Return swizzle select for component. -+ INLINE static uint32_t swizzle(UINT c) -+ { -+ static const uint32_t s[4] = { comp0, comp1, comp2, comp3 }; -+ return s[c]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits -+////////////////////////////////////////////////////////////////////////// -+template -+struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0> -+{ -+ static const uint32_t bpp{ 0 }; -+ static const uint32_t numComps{ 0 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{1}; -+ static const uint32_t bcHeight{1}; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const 
uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32X32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32A32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32_32 TransposeT; -+ typedef Format4<32, 32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_SINT 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32B32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 96 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32_32 TransposeT; -+ typedef Format3<32, 32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static 
const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits 
specialization for R32G32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_FLOAT_X8X24_TYPELESS -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16X16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool 
isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16X16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16A16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16_16 TransposeT; -+ typedef Format4<16, 16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32G32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for 
R32G32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_FLOAT_X8X24_TYPELESS_LD -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose32_32 TransposeT; -+ typedef Format2<32, 32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8A8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8A8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ 
false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits 
specialization for R8G8B8A8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ 
static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_UNORM_SRGB 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R11G11B10_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose11_11_10 TransposeT; -+ typedef Format3<11, 11, 10> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static 
const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R24_UNORM_X8_TYPELESS -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<24> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R24_UNORM_X8_TYPELESS_LD -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<24> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A32_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8X8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B8G8R8X8_UNORM_SRGB 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8X8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8X8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R9G9B9E5_SHAREDEXP -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose9_9_9_5 TransposeT; -+ typedef Format4<9, 9, 9, 5> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10X2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false 
}; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10X2_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8A8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for 
R16G16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16 TransposeT; -+ typedef Format2<16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R32_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<32> TransposeT; -+ typedef Format1<32> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G6R5_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_6_5 TransposeT; -+ typedef Format3<5, 6, 5> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G6R5_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool 
isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_6_5 TransposeT; -+ typedef Format3<5, 6, 5> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5A1_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5A1_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B4G4R4A4_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose4_4_4_4 TransposeT; -+ typedef Format4<4, 4, 4, 4> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B4G4R4A4_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose4_4_4_4 TransposeT; -+ typedef Format4<4, 4, 4, 4> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_UNORM 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t 
bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A16_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ 
Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5X1_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B5G5R5X1_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose5_5_5_1 TransposeT; -+ typedef Format4<5, 5, 5, 1> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef 
Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 2 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8 TransposeT; -+ typedef Format2<8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 16 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<16> TransposeT; -+ typedef Format1<16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static 
const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for A8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ 
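Each of the specializations above pairs a format enum with its per-component layout: upstream, the struct is declared as FormatTraits<FORMAT> and derives from ComponentTraits<type, width, ...>, listing the SWR_TYPE and bit width of every channel, although those angle-bracketed argument lists do not survive in this copy of the patch. Defaults<0, 0, 0, 0x3f800000> supplies the values substituted for missing channels; 0x3f800000 is the IEEE-754 bit pattern of 1.0f (opaque alpha), while the integer formats default the alpha slot to 0x1. What follows is a hedged, self-contained sketch of the pattern only, not the upstream header: ComponentTraits is trimmed to a single channel, the FormatTraits<R8_UNORM> fields are reconstructed from its comment and Format1<8> above, and SurfaceSizeBytes is a hypothetical consumer showing how the compile-time traits (bpp, bcWidth, bcHeight) are typically read.

    // Hedged sketch, not the upstream header: local stand-ins for the SWR
    // traits machinery, reconstructing only the R8_UNORM case shown above.
    #include <cstdint>
    #include <iostream>

    enum SWR_TYPE { SWR_TYPE_UNORM };
    enum SWR_FORMAT { R8_UNORM };

    // Trimmed stand-in: the real ComponentTraits takes a (type, width) pair
    // for each of up to four channels.
    template <SWR_TYPE Type0, uint32_t Width0>
    struct ComponentTraits
    {
        static const SWR_TYPE compType{ Type0 };
        static const uint32_t compWidth{ Width0 };
    };

    template <SWR_FORMAT format> struct FormatTraits;

    // Plausible reconstruction of the R8_UNORM specialization above; the
    // format name comes from its comment, the width from Format1<8>.
    template <> struct FormatTraits<R8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8>
    {
        static const uint32_t bpp{ 8 };
        static const uint32_t numComps{ 1 };
        static const bool isBC{ false };
        static const uint32_t bcWidth{ 1 };
        static const uint32_t bcHeight{ 1 };
    };

    // Hypothetical consumer: size of a w x h surface, rounded up to whole
    // compression blocks (1x1 for uncompressed formats like R8_UNORM).
    template <SWR_FORMAT format>
    uint64_t SurfaceSizeBytes(uint32_t w, uint32_t h)
    {
        typedef FormatTraits<format> FT;
        uint32_t blocksX = (w + FT::bcWidth - 1) / FT::bcWidth;
        uint32_t blocksY = (h + FT::bcHeight - 1) / FT::bcHeight;
        return uint64_t(blocksX) * blocksY * (FT::bpp / 8);
    }

    int main()
    {
        std::cout << "R8_UNORM 64x64: " << SurfaceSizeBytes<R8_UNORM>(64, 64)
                  << " bytes\n";   // 4096
        return 0;
    }

The block-compressed specializations below (BC1 through BC7) work the same way through bcWidth = bcHeight = 4, so the same arithmetic yields bytes per 4x4 block; for example, BC7's 128 bpp / 8 gives 16 bytes per block.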
-+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 8 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef TransposeSingleComponent<8> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for YCRCB_SWAPUVY -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ true }; -+ static const uint32_t bcWidth{ 2 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC1_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC2_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC3_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ 
true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC4_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC5_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC1_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC2_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// 
FormatTraits - Format traits specialization for BC3_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for YCRCB_SWAPUV -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ true }; -+ static const uint32_t bcWidth{ 2 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8_8 TransposeT; -+ typedef Format4<8, 8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const 
bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC4_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 64 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<64> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC5_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_FLOAT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_UNORM 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC7_UNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const 
bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for BC7_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 128 }; -+ static const uint32_t numComps{ 1 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ true }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 4 }; -+ static const uint32_t bcHeight{ 4 }; -+ -+ typedef TransposeSingleComponent<128> TransposeT; -+ typedef Format1<8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_UNORM_SRGB -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ true }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R16G16B16_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 48 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose16_16_16 TransposeT; -+ typedef Format3<16, 16, 16> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_SNORM 
-+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R10G10B10A2_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_SNORM -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static 
const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_USCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_SSCALED -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x3f800000> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for B10G10R10A2_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<2, 1, 0, 3>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 32 }; -+ static const uint32_t numComps{ 4 }; -+ static const bool hasAlpha{ true }; -+ static const uint32_t alphaComp{ 3 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose10_10_10_2 TransposeT; -+ typedef Format4<10, 10, 10, 2> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// 
FormatTraits - Format traits specialization for R8G8B8_UINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// FormatTraits - Format traits specialization for R8G8B8_SINT -+////////////////////////////////////////////////////////////////////////// -+template<> struct FormatTraits : -+ ComponentTraits, -+ FormatSwizzle<0, 1, 2>, -+ Defaults<0, 0, 0, 0x1> -+{ -+ static const uint32_t bpp{ 24 }; -+ static const uint32_t numComps{ 3 }; -+ static const bool hasAlpha{ false }; -+ static const uint32_t alphaComp{ 0 }; -+ static const bool isSRGB{ false }; -+ static const bool isBC{ false }; -+ static const bool isSubsampled{ false }; -+ static const uint32_t bcWidth{ 1 }; -+ static const uint32_t bcHeight{ 1 }; -+ -+ typedef Transpose8_8_8 TransposeT; -+ typedef Format3<8, 8, 8> FormatT; -+}; -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h -new file mode 100644 -index 0000000..92125df ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h -@@ -0,0 +1,1053 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file formats.h -+* -+* @brief Definitions for SWR_FORMAT functions. 
-+* -+******************************************************************************/ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking same pixel sizes -+////////////////////////////////////////////////////////////////////////// -+template -+struct PackTraits -+{ -+ static const uint32_t MyNumBits = NumBits; -+ static simdscalar loadSOA(const BYTE *pSrc) = delete; -+ static void storeSOA(BYTE *pDst, simdscalar src) = delete; -+ static simdscalar unpack(simdscalar &in) = delete; -+ static simdscalar pack(simdscalar &in) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking unused channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<0, false> -+{ -+ static const uint32_t MyNumBits = 0; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); } -+ static void storeSOA(BYTE *pDst, simdscalar src) { return; } -+ static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); } -+ static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); } -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<8, false> -+{ -+ static const uint32_t MyNumBits = 8; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+ // store simd bytes -+#if KNOB_SIMD_WIDTH == 8 -+ _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepu8_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); -+ __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128()); -+ return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 8 bit signed channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<8, true> -+{ -+ static const uint32_t MyNumBits = 8; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if 
KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+ // store simd bytes -+#if KNOB_SIMD_WIDTH == 8 -+ _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ SWR_ASSERT(0); // I think this may be incorrect. -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepi8_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); -+ __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128()); -+ return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<16, false> -+{ -+ static const uint32_t MyNumBits = 16; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_load_ps((const float*)pSrc); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ // store 16B (2B * 8) -+ _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepu16_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); -+ return _mm256_castsi256_ps(res); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / 
unpacking 16 bit signed channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<16, true> -+{ -+ static const uint32_t MyNumBits = 16; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ __m256 result = _mm256_setzero_ps(); -+ __m128 vLo = _mm_load_ps((const float*)pSrc); -+ return _mm256_insertf128_ps(result, vLo, 0); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static void storeSOA(BYTE *pDst, simdscalar src) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ // store 16B (2B * 8) -+ _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH==KNOB_ARCH_AVX -+ SWR_ASSERT(0); // I think this is incorrectly implemented -+ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); -+ __m128i resLo = _mm_cvtepi16_epi32(src); -+ __m128i resHi = _mm_shuffle_epi8(src, -+ _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); -+ -+ __m256i result = _mm256_castsi128_si256(resLo); -+ result = _mm256_insertf128_si256(result, resHi, 1); -+ return _mm256_castsi256_ps(result); -+#elif KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar pack(simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src = _simd_castps_si(in); -+ __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); -+ return _mm256_castsi256_ps(res); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PackTraits - Helpers for packing / unpacking 32 bit channels -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct PackTraits<32, false> -+{ -+ static const uint32_t MyNumBits = 32; -+ -+ static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); } -+ static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } -+ static simdscalar unpack(simdscalar &in) { return in; } -+ static simdscalar pack(simdscalar &in) { return in; } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits. 
-+//////////////////////////////////////////////////////////////////////////
-+template<SWR_TYPE type, uint32_t NumBits>
-+struct TypeTraits : PackTraits<NumBits>
-+{
-+    static const SWR_TYPE MyType = type;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for UINT8
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for SINT8
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for UINT16
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for SINT16
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for UINT32
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// TypeTraits - Format type traits specialization for SINT32
-+//////////////////////////////////////////////////////////////////////////
-+template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
-+{
-+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
-+    static float toFloat() { return 0.0; }
-+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
-+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
-+};
-+
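For the normalized-type specializations that follow, toFloat() and fromFloat() act as the scale factors between the integer channel encoding and the float range. A minimal scalar sketch of how such factors would be applied for a UNORM8 channel is shown below; the helper names, the clamp, and the round-to-nearest step are illustrative assumptions, not code from this patch:

    // Illustrative scalar sketch (not part of the patch): applying the UNORM8
    // scale factors (1/255 and 255) from the TypeTraits specialization that follows.
    #include <algorithm>
    #include <cstdint>

    static float Unorm8ToFloat(uint8_t v)
    {
        return v * (1.0f / 255.0f);              // corresponds to toFloat() for UNORM8
    }

    static uint8_t FloatToUnorm8(float f)
    {
        f = std::min(std::max(f, 0.0f), 1.0f);   // clamp to [0, 1] (assumed)
        return (uint8_t)(f * 255.0f + 0.5f);     // corresponds to fromFloat(), rounded (assumed)
    }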
-+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM8 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<8> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 255.0f; } -+ static float fromFloat() { return 255.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM8 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<8, true> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_SNORM; -+ static float toFloat() { return 1.0f / 127.0f; } -+ static float fromFloat() { return 127.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM16 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<16> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 65535.0f; } -+ static float fromFloat() { return 65535.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for SNORM16 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<16, true> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 32767.0f; } -+ static float fromFloat() { return 32767.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for UNORM24 -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_UNORM; -+ static float toFloat() { return 1.0f / 16777215.0f; } -+ static float fromFloat() { return 16777215.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+// FLOAT Specializations from here on... -+////////////////////////////////////////////////////////////////////////// -+#define TO_M128i(a) _mm_castps_si128(a) -+#define TO_M128(a) _mm_castsi128_ps(a) -+ -+#include "math.h" -+ -+template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden > -+inline static __m128 fastpow(__m128 arg) { -+ __m128 ret = arg; -+ -+ static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) -+ * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum)); -+ -+ // Apply a constant pre-correction factor. -+ ret = _mm_mul_ps(ret, factor); -+ -+ // Reinterpret arg as integer to obtain logarithm. -+ //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret)); -+ ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); -+ -+ // Multiply logarithm by power. 
-+ ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden)); -+ -+ // Convert back to "integer" to exponentiate. -+ //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret)); -+ ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); -+ -+ return ret; -+} -+ -+inline static __m128 pow512_4(__m128 arg) { -+ // 5/12 is too small, so compute the 4th root of 20/12 instead. -+ // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. -+ // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 -+ __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg); -+ __m128 xover = _mm_mul_ps(arg, xf); -+ -+ __m128 xfm1 = _mm_rsqrt_ps(xf); -+ __m128 x2 = _mm_mul_ps(arg, arg); -+ __m128 xunder = _mm_mul_ps(x2, xfm1); -+ -+ // sqrt2 * over + 2 * sqrt2 * under -+ __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), -+ _mm_add_ps(xover, xunder)); -+ -+ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); -+ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); -+ return xavg; -+} -+ -+inline static __m128 powf_wrapper(__m128 Base, float Exp) -+{ -+ float *f = (float *)(&Base); -+ -+ return _mm_set_ps(powf(f[0], Exp), -+ powf(f[1], Exp), -+ powf(f[2], Exp), -+ powf(f[3], Exp)); -+} -+ -+static inline __m128 ConvertFloatToSRGB2(__m128& Src) -+{ -+ // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value -+ __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src)); -+ -+ // squeeze the mask down to 16 bits (4 bits per DWORD) -+ int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask); -+ -+ __m128 Result; -+ -+ // -+ if (CompareResult == 0xFFFF) -+ { -+ // all DWORDs are <= the threshold -+ Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); -+ } -+ else if (CompareResult == 0x0) -+ { -+ // all DWORDs are > the threshold -+ __m128 fSrc_0RGB = Src; -+ -+ // --> 1.055f * c(1.0f/2.4f) - 0.055f -+#if KNOB_USE_FAST_SRGB == TRUE -+ // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. -+ __m128 f = pow512_4(fSrc_0RGB); -+#else -+ __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); -+#endif -+ f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); -+ Result = _mm_sub_ps(f, _mm_set1_ps(0.055f)); -+ } -+ else -+ { -+ // some DWORDs are <= the threshold and some are > threshold -+ __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); -+ -+ __m128 fSrc_0RGB = Src; -+ -+ // --> 1.055f * c(1.0f/2.4f) - 0.055f -+#if KNOB_USE_FAST_SRGB == TRUE -+ // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 
-+ __m128 f = pow512_4(fSrc_0RGB); -+#else -+ __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); -+#endif -+ f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); -+ f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); -+ -+ // Clear the alpha (is garbage after the sub) -+ __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); -+ -+ __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); -+ __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); -+ __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); -+ -+ Result = TO_M128(CombinedParts); -+ } -+ -+ return Result; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for FLOAT16 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<16> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_FLOAT; -+ static float toFloat() { return 1.0f; } -+ static float fromFloat() { return 1.0f; } -+ static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } -+ -+ static simdscalar pack(const simdscalar &in) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+#if (KNOB_ARCH == KNOB_ARCH_AVX) -+ // input is 8 packed float32, output is 8 packed float16 -+ simdscalari src = _simd_castps_si(in); -+ -+ static const uint32_t FLOAT_EXP_BITS = 8; -+ static const uint32_t FLOAT_MANTISSA_BITS = 23; -+ static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; -+ static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; -+ -+ static const uint32_t HALF_EXP_BITS = 5; -+ static const uint32_t HALF_MANTISSA_BITS = 10; -+ static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1; -+ static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS; -+ -+ // minimum exponent required, exponents below this are flushed to 0. 
-+ static const int32_t HALF_EXP_MIN = -14; -+ static const int32_t FLOAT_EXP_BIAS = 127; -+ static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; -+ static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand -+ -+ // maximum exponent required, exponents above this are set to infinity -+ static const int32_t HALF_EXP_MAX = 15; -+ static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; -+ -+ const simdscalari vSignMask = _simd_set1_epi32(0x80000000); -+ const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); -+ const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); -+ const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); -+ const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); -+ const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); -+ -+ simdscalari vSign = _simd_and_si(src, vSignMask); -+ simdscalari vExp = _simd_and_si(src, vExpMask); -+ simdscalari vMan = _simd_and_si(src, vManMask); -+ -+ simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); -+ simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); -+ simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); -+ simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); -+ -+ simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); -+ -+ // pack output 16-bits into the lower 16-bits of each 32-bit channel -+ simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); -+ vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); -+ -+ // Flush To Zero -+ vDst = _simd_andnot_si(vFTZMask, vDst); -+ // Apply Infinites / NaN -+ vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); -+ -+ // Apply clamps -+ vDst = _simd_andnot_si(vClampMask, vDst); -+ vDst = _simd_or_si(vDst, -+ _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); -+ -+ // Compute Denormals (subnormals) -+ if (!_mm256_testz_si256(vDenormMask, vDenormMask)) -+ { -+ uint32_t *pDenormMask = (uint32_t*)&vDenormMask; -+ uint32_t *pExp = (uint32_t*)&vExp; -+ uint32_t *pMan = (uint32_t*)&vMan; -+ uint32_t *pDst = (uint32_t*)&vDst; -+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) -+ { -+ if (pDenormMask[i]) -+ { -+ // Need to compute subnormal value -+ uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; -+ uint32_t mantissa = pMan[i] | -+ (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
Make it explicit -+ -+ pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); -+ } -+ } -+ } -+ -+ // Add in sign bits -+ vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); -+ -+ // Pack to lower 128-bits -+ vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); -+ -+#if 0 -+#if !defined(NDEBUG) -+ simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]); -+ } -+#endif -+#endif -+ -+ return _simd_castsi_ps(vDst); -+ -+#else -+ return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC))); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+ -+ static simdscalar unpack(const simdscalar &in) -+ { -+ // input is 8 packed float16, output is 8 packed float32 -+ SWR_ASSERT(0); // @todo -+ return _simd_setzero_ps(); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// TypeTraits - Format type traits specialization for FLOAT32 -+////////////////////////////////////////////////////////////////////////// -+template<> struct TypeTraits : PackTraits<32> -+{ -+ static const SWR_TYPE MyType = SWR_TYPE_FLOAT; -+ static float toFloat() { return 1.0f; } -+ static float fromFloat() { return 1.0f; } -+ static inline simdscalar convertSrgb(simdscalar &in) -+ { -+#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2) -+ __m128 srcLo = _mm256_extractf128_ps(in, 0); -+ __m128 srcHi = _mm256_extractf128_ps(in, 1); -+ -+ srcLo = ConvertFloatToSRGB2(srcLo); -+ srcHi = ConvertFloatToSRGB2(srcHi); -+ -+ in = _mm256_insertf128_ps(in, srcLo, 0); -+ in = _mm256_insertf128_ps(in, srcHi, 1); -+ -+#endif -+ return in; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format1 - Bitfield for single component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+struct Format1 -+{ -+ union -+ { -+ uint32_t r : x; -+ -+ ///@ The following are here to provide full template needed in Formats. -+ uint32_t g : x; -+ uint32_t b : x; -+ uint32_t a : x; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format1 - Bitfield for single component formats - 8 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format1<8> -+{ -+ union -+ { -+ uint8_t r; -+ -+ ///@ The following are here to provide full template needed in Formats. -+ uint8_t g; -+ uint8_t b; -+ uint8_t a; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format1 - Bitfield for single component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format1<16> -+{ -+ union -+ { -+ uint16_t r; -+ -+ ///@ The following are here to provide full template needed in Formats. -+ uint16_t g; -+ uint16_t b; -+ uint16_t a; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format2 - Bitfield for 2 component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+union Format2 -+{ -+ struct -+ { -+ uint32_t r : x; -+ uint32_t g : y; -+ }; -+ struct -+ { -+ ///@ The following are here to provide full template needed in Formats. 
-+ uint32_t b : x; -+ uint32_t a : y; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format2 - Bitfield for 2 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+union Format2<8,8> -+{ -+ struct -+ { -+ uint16_t r : 8; -+ uint16_t g : 8; -+ }; -+ struct -+ { -+ ///@ The following are here to provide full template needed in Formats. -+ uint16_t b : 8; -+ uint16_t a : 8; -+ }; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format3 - Bitfield for 3 component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+union Format3 -+{ -+ struct -+ { -+ uint32_t r : x; -+ uint32_t g : y; -+ uint32_t b : z; -+ }; -+ uint32_t a; ///@note This is here to provide full template needed in Formats. -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format3 - Bitfield for 3 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+union Format3<5,6,5> -+{ -+ struct -+ { -+ uint16_t r : 5; -+ uint16_t g : 6; -+ uint16_t b : 5; -+ }; -+ uint16_t a; ///@note This is here to provide full template needed in Formats. -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format4 - Bitfield for 4 component formats. -+////////////////////////////////////////////////////////////////////////// -+template -+struct Format4 -+{ -+ uint32_t r : x; -+ uint32_t g : y; -+ uint32_t b : z; -+ uint32_t a : w; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format4 - Bitfield for 4 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format4<5,5,5,1> -+{ -+ uint16_t r : 5; -+ uint16_t g : 5; -+ uint16_t b : 5; -+ uint16_t a : 1; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Format4 - Bitfield for 4 component formats - 16 bit specialization -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct Format4<4,4,4,4> -+{ -+ uint16_t r : 4; -+ uint16_t g : 4; -+ uint16_t b : 4; -+ uint16_t a : 4; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ComponentTraits - Default components -+////////////////////////////////////////////////////////////////////////// -+template -+struct Defaults -+{ -+ INLINE static uint32_t GetDefault(uint32_t comp) -+ { -+ static const uint32_t defaults[4]{ x, y, z, w }; -+ return defaults[comp]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ComponentTraits - Component type traits. -+////////////////////////////////////////////////////////////////////////// -+template -+struct ComponentTraits -+{ -+ INLINE static SWR_TYPE GetType(uint32_t comp) -+ { -+ static const SWR_TYPE CompType[4]{ X, Y, Z, W }; -+ return CompType[comp]; -+ } -+ -+ INLINE static uint32_t GetBPC(uint32_t comp) -+ { -+ static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW }; -+ return MyBpc[comp]; -+ } -+ -+ INLINE static bool isNormalized(uint32_t comp) -+ { -+ switch (comp) -+ { -+ case 0: -+ return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false; -+ case 1: -+ return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? 
true : false; -+ case 2: -+ return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false; -+ case 3: -+ return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false; -+ } -+ SWR_ASSERT(0); -+ return false; -+ } -+ -+ INLINE static float toFloat(uint32_t comp) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::toFloat(); -+ case 1: -+ return TypeTraits::toFloat(); -+ case 2: -+ return TypeTraits::toFloat(); -+ case 3: -+ return TypeTraits::toFloat(); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::toFloat(); -+ -+ } -+ -+ INLINE static float fromFloat(uint32_t comp) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::fromFloat(); -+ case 1: -+ return TypeTraits::fromFloat(); -+ case 2: -+ return TypeTraits::fromFloat(); -+ case 3: -+ return TypeTraits::fromFloat(); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::fromFloat(); -+ } -+ -+ INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::loadSOA(pSrc); -+ case 1: -+ return TypeTraits::loadSOA(pSrc); -+ case 2: -+ return TypeTraits::loadSOA(pSrc); -+ case 3: -+ return TypeTraits::loadSOA(pSrc); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::loadSOA(pSrc); -+ } -+ -+ INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src) -+ { -+ switch (comp) -+ { -+ case 0: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ case 1: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ case 2: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ case 3: -+ TypeTraits::storeSOA(pDst, src); -+ return; -+ } -+ SWR_ASSERT(0); -+ TypeTraits::storeSOA(pDst, src); -+ } -+ -+ INLINE static simdscalar unpack(uint32_t comp, simdscalar &in) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::unpack(in); -+ case 1: -+ return TypeTraits::unpack(in); -+ case 2: -+ return TypeTraits::unpack(in); -+ case 3: -+ return TypeTraits::unpack(in); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::unpack(in); -+ } -+ -+ INLINE static simdscalar pack(uint32_t comp, simdscalar &in) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::pack(in); -+ case 1: -+ return TypeTraits::pack(in); -+ case 2: -+ return TypeTraits::pack(in); -+ case 3: -+ return TypeTraits::pack(in); -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::pack(in); -+ } -+ -+ INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in) -+ { -+ switch (comp) -+ { -+ case 0: -+ return TypeTraits::convertSrgb(in);; -+ case 1: -+ return TypeTraits::convertSrgb(in);; -+ case 2: -+ return TypeTraits::convertSrgb(in);; -+ case 3: -+ return TypeTraits::convertSrgb(in);; -+ } -+ SWR_ASSERT(0); -+ return TypeTraits::convertSrgb(in); -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp -new file mode 100644 -index 0000000..986e49f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp -@@ -0,0 +1,1972 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file frontend.cpp -+* -+* @brief Implementation for Frontend which handles vertex processing, -+* primitive assembly, clipping, binning, etc. -+* -+******************************************************************************/ -+ -+#include "api.h" -+#include "frontend.h" -+#include "backend.h" -+#include "context.h" -+#include "rdtsc_core.h" -+#include "rasterizer.h" -+#include "utils.h" -+#include "threads.h" -+#include "pa.h" -+#include "clip.h" -+#include "tilemgr.h" -+#include "tessellator.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Helper macro to generate a bitmask -+static INLINE uint32_t GenMask(uint32_t numBits) -+{ -+ SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); -+ return ((1U << numBits) - 1); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrSync. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to sync callback. -+/// @todo This should go away when we switch this to use compute threading. -+void ProcessSync( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ SYNC_DESC *pSync = (SYNC_DESC*)pUserData; -+ BE_WORK work; -+ work.type = SYNC; -+ work.pfnWork = ProcessSyncBE; -+ work.desc.sync = *pSync; -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ pTileMgr->enqueue(0, 0, &work); -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrGetStats. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to stats callback. -+/// @todo This should go away when we switch this to use compute threading. 
-+void ProcessQueryStats( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData; -+ BE_WORK work; -+ work.type = QUERYSTATS; -+ work.pfnWork = ProcessQueryStatsBE; -+ work.desc.queryStats = *pQueryStats; -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ pTileMgr->enqueue(0, 0, &work); -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrClearRenderTarget. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to clear callback. -+/// @todo This should go away when we switch this to use compute threading. -+void ProcessClear( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ // queue a clear to each macro tile -+ // compute macro tile bounds for the current scissor/viewport -+ uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED; -+ uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED; -+ uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED; -+ uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED; -+ -+ BE_WORK work; -+ work.type = CLEAR; -+ work.pfnWork = ProcessClearBE; -+ work.desc.clear = *pClear; -+ -+ for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y) -+ { -+ for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x) -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrStoreTiles. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to callback. -+/// @todo This should go away when we switch this to use compute threading. 
-+void ProcessStoreTiles( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ RDTSC_START(FEProcessStoreTiles); -+ STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData; -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ // queue a store to each macro tile -+ // compute macro tile bounds for the current render target -+ const uint32_t macroWidth = KNOB_MACROTILE_X_DIM; -+ const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; -+ -+ uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; -+ uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; -+ -+ // store tiles -+ BE_WORK work; -+ work.type = STORETILES; -+ work.pfnWork = ProcessStoreTileBE; -+ work.desc.storeTiles = *pStore; -+ -+ for (uint32_t x = 0; x < numMacroTilesX; ++x) -+ { -+ for (uint32_t y = 0; y < numMacroTilesY; ++y) -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+ -+ RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrInvalidateTiles. -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pUserData - Pointer to user data passed back to callback. -+/// @todo This should go away when we switch this to use compute threading. -+void ProcessInvalidateTiles( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ RDTSC_START(FEProcessInvalidateTiles); -+ INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData; -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ -+ const API_STATE& state = GetApiState(pDC); -+ -+ // queue a store to each macro tile -+ // compute macro tile bounds for the current render target -+ uint32_t macroWidth = KNOB_MACROTILE_X_DIM; -+ uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; -+ -+ uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; -+ uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; -+ -+ // load tiles -+ BE_WORK work; -+ work.type = INVALIDATETILES; -+ work.pfnWork = ProcessInvalidateTilesBE; -+ work.desc.invalidateTiles = *pInv; -+ -+ for (uint32_t x = 0; x < numMacroTilesX; ++x) -+ { -+ for (uint32_t y = 0; y < numMacroTilesY; ++y) -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+ -+ RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the number of primitives given the number of verts. -+/// @param mode - primitive topology for draw operation. -+/// @param numElements - number of vertices or indices for draw. -+/// @todo Frontend needs to be refactored. This will go in appropriate place then. -+uint32_t GetNumPrims( -+ PRIMITIVE_TOPOLOGY mode, -+ uint32_t numElements) -+{ -+ switch (mode) -+ { -+ case TOP_POINT_LIST: return numElements; -+ case TOP_TRIANGLE_LIST: return numElements / 3; -+ case TOP_TRIANGLE_STRIP: return numElements < 3 ? 0 : numElements - 2; -+ case TOP_TRIANGLE_FAN: return numElements < 3 ? 
0 : numElements - 2; -+ case TOP_TRIANGLE_DISC: return numElements < 2 ? 0 : numElements - 1; -+ case TOP_QUAD_LIST: return numElements / 4; -+ case TOP_QUAD_STRIP: return numElements < 4 ? 0 : (numElements - 2) / 2; -+ case TOP_LINE_STRIP: return numElements < 2 ? 0 : numElements - 1; -+ case TOP_LINE_LIST: return numElements / 2; -+ case TOP_LINE_LOOP: return numElements; -+ case TOP_RECT_LIST: return numElements / 3; -+ -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ return numElements / (mode - TOP_PATCHLIST_BASE); -+ -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LISTSTRIP_ADJ: -+ case TOP_TRI_LIST_ADJ: -+ case TOP_TRI_STRIP_ADJ: -+ case TOP_TRI_STRIP_REVERSE: -+ case TOP_POLYGON: -+ case TOP_POINT_LIST_BF: -+ case TOP_LINE_STRIP_CONT: -+ case TOP_LINE_STRIP_BF: -+ case TOP_LINE_STRIP_CONT_BF: -+ case TOP_TRIANGLE_FAN_NOSTIPPLE: -+ case TOP_PATCHLIST_BASE: -+ case TOP_UNKNOWN: -+ SWR_ASSERT(false, "Unsupported topology: %d", mode); -+ return 0; -+ } -+ -+ return 0; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Return number of verts per primitive. 
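-+/// e.g. TOP_TRIANGLE_LIST -> 3; TOP_LINE_LIST_ADJ -> 2, or 4 when includeAdjVerts is set.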
-+/// @param topology - topology -+/// @param includeAdjVerts - include adjacent verts in primitive vertices -+INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) -+{ -+ uint32_t numVerts = 0; -+ switch (topology) -+ { -+ case TOP_POINT_LIST: -+ case TOP_POINT_LIST_BF: -+ numVerts = 1; -+ break; -+ case TOP_LINE_LIST: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LIST_ADJ: -+ case TOP_LINE_LOOP: -+ case TOP_LINE_STRIP_CONT: -+ case TOP_LINE_STRIP_BF: -+ case TOP_LISTSTRIP_ADJ: -+ numVerts = 2; -+ break; -+ case TOP_TRIANGLE_LIST: -+ case TOP_TRIANGLE_STRIP: -+ case TOP_TRIANGLE_FAN: -+ case TOP_TRI_LIST_ADJ: -+ case TOP_TRI_STRIP_ADJ: -+ case TOP_TRI_STRIP_REVERSE: -+ case TOP_RECT_LIST: -+ numVerts = 3; -+ break; -+ case TOP_QUAD_LIST: -+ case TOP_QUAD_STRIP: -+ numVerts = 4; -+ break; -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ numVerts = topology - TOP_PATCHLIST_BASE; -+ break; -+ default: -+ SWR_ASSERT(false, "Unsupported topology: %d", topology); -+ break; -+ } -+ -+ if (includeAdjVerts) -+ { -+ switch (topology) -+ { -+ case TOP_LISTSTRIP_ADJ: -+ case TOP_LINE_LIST_ADJ: numVerts = 4; break; -+ case TOP_TRI_STRIP_ADJ: -+ case TOP_TRI_LIST_ADJ: numVerts = 6; break; -+ default: break; -+ } -+ } -+ -+ return numVerts; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief StreamOut - Streams vertex data out to SO buffers. -+/// Generally, we are only streaming out a SIMDs worth of triangles. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris) -+static void StreamOut( -+ DRAW_CONTEXT* pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ uint32_t* pPrimData) -+{ -+ RDTSC_START(FEStreamout); -+ -+ SWR_CONTEXT* pContext = pDC->pContext; -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_STREAMOUT_STATE &soState = state.soState; -+ -+ uint32_t streamIndex = 0; ///@todo Stream index will come from PA_STATE. -+ uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); -+ -+ // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex. -+ uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t); -+ -+ SWR_STREAMOUT_CONTEXT soContext = { 0 }; -+ -+ // Setup buffer state pointers. -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ soContext.pBuffer[i] = &state.soBuffer[i]; -+ } -+ -+ uint32_t numPrims = pa.NumPrims(); -+ for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) -+ { -+ DWORD slot = 0; -+ uint32_t soMask = soState.streamMasks[streamIndex]; -+ -+ // Write all entries into primitive data buffer for SOS. 
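-+ // soMask carries one bit per VS output slot enabled for this stream; walk the set
-+ // bits, assemble each attribute for this prim, and scatter it into the sparse
-+ // pPrimData layout (4 floats per slot, primDataDwordVertexStride dwords per vertex).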
-+ while (_BitScanForward(&slot, soMask)) -+ { -+ __m128 attrib[MAX_ATTRIBUTES]; // prim attribs (always 4 wide) -+ uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT; -+ pa.AssembleSingle(paSlot, primIndex, attrib); -+ -+ // Attribute offset is relative offset from start of vertex. -+ // Note that attributes start at slot 1 in the PA buffer. We need to write this -+ // to prim data starting at slot 0. Which is why we do (slot - 1). -+ // Also note: GL works slightly differently, and needs slot 0 -+ uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t); -+ -+ // Store each vertex's attrib at appropriate locations in pPrimData buffer. -+ for (uint32_t v = 0; v < soVertsPerPrim; ++v) -+ { -+ uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); -+ -+ _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); -+ } -+ soMask &= ~(1 << slot); -+ } -+ -+ // Update pPrimData pointer -+ soContext.pPrimData = pPrimData; -+ -+ // Call SOS -+ state.pfnSoFunc[streamIndex](soContext); -+ } -+ -+ // Update SO write offset. The driver provides memory for the update. -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ if (state.soBuffer[i].pWriteOffset) -+ { -+ *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); -+ -+ // The SOS increments the existing write offset. So we don't want to increment -+ // the SoWriteOffset stat using an absolute offset instead of relative. -+ SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset); -+ } -+ } -+ -+ UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); -+ UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); -+ -+ RDTSC_STOP(FEStreamout, 1, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes number of invocations. The current index represents -+/// the start of the SIMD. The max index represents how much work -+/// items are remaining. If there is less then a SIMD's left of work -+/// then return the remaining amount of work. -+/// @param curIndex - The start index for the SIMD. -+/// @param maxIndex - The last index for all work items. -+static INLINE uint32_t GetNumInvocations( -+ uint32_t curIndex, -+ uint32_t maxIndex) -+{ -+ uint32_t remainder = (maxIndex - curIndex); -+ return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Implements GS stage. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pa - The primitive assembly object. 
-+/// @param pGsOut - output stream for GS -+template < -+ bool HasStreamOutT, -+ bool HasRastT> -+static void GeometryShaderStage( -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ PA_STATE& pa, -+ void* pGsOut, -+ void* pCutBuffer, -+ uint32_t* pSoPrimData, -+ simdscalari primID) -+{ -+ RDTSC_START(FEGeometryShader); -+ -+ SWR_GS_CONTEXT gsContext; -+ SWR_CONTEXT* pContext = pDC->pContext; -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_GS_STATE* pState = &state.gsState; -+ -+ SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); -+ SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); -+ -+ gsContext.pStream[0] = (uint8_t*)pGsOut; -+ gsContext.pCutBuffer = (uint8_t*)pCutBuffer; -+ gsContext.PrimitiveID = primID; -+ -+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); -+ simdvector attrib[MAX_ATTRIBUTES]; -+ -+ // assemble all attributes for the input primitive -+ for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; -+ pa.Assemble(attribSlot, attrib); -+ -+ for (uint32_t i = 0; i < numVertsPerPrim; ++i) -+ { -+ gsContext.vert[i].attrib[attribSlot] = attrib[i]; -+ } -+ } -+ -+ // assemble position -+ pa.Assemble(VERTEX_POSITION_SLOT, attrib); -+ for (uint32_t i = 0; i < numVertsPerPrim; ++i) -+ { -+ gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; -+ } -+ -+ const uint32_t vertexStride = sizeof(simdvertex); -+ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; -+ const uint32_t inputPrimStride = numSimdBatches * vertexStride; -+ const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH; -+ const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; -+ const uint32_t cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; -+ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) -+ { -+ gsContext.InstanceID = instance; -+ -+ // execute the geometry shader -+ state.pfnGsFunc(GetPrivateState(pDC), &gsContext); -+ -+ gsContext.pStream[0] += instanceStride; -+ gsContext.pCutBuffer += cutInstanceStride; -+ } -+ -+ // record valid prims from the frontend to avoid over binning the newly generated -+ // prims from the GS -+ uint32_t numInputPrims = pa.NumPrims(); -+ -+ // set up new binner and state for the GS output topology -+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr; -+ if (HasRastT) -+ { -+ switch (pState->outputTopology) -+ { -+ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; -+ case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; -+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; -+ default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology); -+ } -+ } -+ -+ // foreach input prim: -+ // - setup a new PA based on the emitted verts for that prim -+ // - loop over the new verts, calling PA to assemble each prim -+ uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; -+ uint32_t* pPrimitiveId = (uint32_t*)&primID; -+ -+ uint32_t totalPrimsGenerated = 0; -+ for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) -+ { -+ uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride; -+ uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride; -+ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) -+ { -+ uint32_t numEmittedVerts = pVertexCount[inputPrim]; -+ if (numEmittedVerts == 0) -+ { -+ continue; -+ } -+ -+ uint8_t* pBase = pInstanceBase + 
instance * instanceStride; -+ uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; -+ -+ DWORD numAttribs; -+ _BitScanReverse(&numAttribs, state.feAttribMask); -+ numAttribs++; -+ -+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBase, numEmittedVerts, numAttribs, pState->outputTopology, true); -+ -+ while (gsPa.GetNextStreamOutput()) -+ { -+ do -+ { -+ bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); -+ -+ if (assemble) -+ { -+ totalPrimsGenerated += gsPa.NumPrims(); -+ -+ if (HasStreamOutT) -+ { -+ StreamOut(pDC, gsPa, workerId, pSoPrimData); -+ } -+ -+ if (HasRastT) -+ { -+ simdscalari vPrimId; -+ // pull primitiveID from the GS output if available -+ if (state.gsState.emitsPrimitiveID) -+ { -+ simdvector primIdAttrib[3]; -+ gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib); -+ vPrimId = _simd_castps_si(primIdAttrib[0].x); -+ } -+ else -+ { -+ vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); -+ } -+ -+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); -+ } -+ } -+ } while (gsPa.NextPrim()); -+ } -+ } -+ } -+ -+ // update GS pipeline stats -+ UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount); -+ UPDATE_STAT(GsPrimitives, totalPrimsGenerated); -+ -+ RDTSC_STOP(FEGeometryShader, 1, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Allocate GS buffers -+/// @param pDC - pointer to draw context. -+/// @param state - API state -+/// @param ppGsOut - pointer to GS output buffer allocation -+/// @param ppCutBuffer - pointer to GS output cut buffer allocation -+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer) -+{ -+ SWR_ASSERT(state.gsState.gsEnable); -+ // allocate arena space to hold GS output verts -+ // @todo pack attribs -+ // @todo support multiple streams -+ const uint32_t vertexStride = sizeof(simdvertex); -+ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; -+ uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH; -+ *ppGsOut = pDC->arena.AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float)); -+ -+ // allocate arena space to hold cut buffer, which is essentially a bitfield sized to the -+ // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance -+ const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; -+ const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; -+ *ppCutBuffer = pDC->arena.AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate mask from remaining work. -+/// @param numWorkItems - Number of items being worked on by a SIMD. -+static INLINE simdscalari GenerateMask(uint32_t numWorkItems) -+{ -+ uint32_t numActive = (numWorkItems >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numWorkItems; -+ uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; -+ return _simd_castps_si(vMask(mask)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Contains all data generated by the HS and passed to the -+/// tessellator and DS. 
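-+/// One instance lives per worker thread (allocated lazily by AllocateTessellationData)
-+/// and persists across draws; patchData holds one SIMD's worth of HS control-point output.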
-+struct TessellationThreadLocalData -+{ -+ ScalarPatch patchData[KNOB_SIMD_WIDTH]; -+ void* pTxCtx; -+ size_t tsCtxSize; -+ -+ simdscalar* pDSOutput; -+ size_t numDSOutputVectors; -+}; -+ -+THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Allocate tessellation data for this worker thread. -+INLINE -+static void AllocateTessellationData(SWR_CONTEXT* pContext) -+{ -+ /// @TODO - Don't use thread local storage. Use Worker local storage instead. -+ if (gt_pTessellationThreadData == nullptr) -+ { -+ gt_pTessellationThreadData = (TessellationThreadLocalData*) -+ _aligned_malloc(sizeof(TessellationThreadLocalData), 64); -+ memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Implements Tessellation Stages. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param pa - The primitive assembly object. -+/// @param pGsOut - output stream for GS -+template < -+ bool HasGeometryShaderT, -+ bool HasStreamOutT, -+ bool HasRastT> -+static void TessellationStages( -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ PA_STATE& pa, -+ void* pGsOut, -+ void* pCutBuffer, -+ uint32_t* pSoPrimData, -+ simdscalari primID) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_TS_STATE& tsState = state.tsState; -+ SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro -+ -+ SWR_ASSERT(gt_pTessellationThreadData); -+ -+ HANDLE tsCtx = TSInitCtx( -+ tsState.domain, -+ tsState.partitioning, -+ tsState.tsOutputTopology, -+ gt_pTessellationThreadData->pTxCtx, -+ gt_pTessellationThreadData->tsCtxSize); -+ if (tsCtx == nullptr) -+ { -+ gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64); -+ tsCtx = TSInitCtx( -+ tsState.domain, -+ tsState.partitioning, -+ tsState.tsOutputTopology, -+ gt_pTessellationThreadData->pTxCtx, -+ gt_pTessellationThreadData->tsCtxSize); -+ } -+ SWR_ASSERT(tsCtx); -+ -+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr; -+ if (HasRastT) -+ { -+ switch (tsState.postDSTopology) -+ { -+ case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; -+ case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; -+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; -+ default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology); -+ } -+ } -+ -+ SWR_HS_CONTEXT hsContext; -+ hsContext.pCPout = gt_pTessellationThreadData->patchData; -+ hsContext.PrimitiveID = primID; -+ -+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); -+ // Max storage for one attribute for an entire simdprimitive -+ simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; -+ -+ // assemble all attributes for the input primitives -+ for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) -+ { -+ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; -+ pa.Assemble(attribSlot, simdattrib); -+ -+ for (uint32_t i = 0; i < numVertsPerPrim; ++i) -+ { -+ hsContext.vert[i].attrib[attribSlot] = simdattrib[i]; -+ } -+ } -+ -+#if defined(_DEBUG) -+ memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); -+#endif -+ -+ // Run the HS -+ RDTSC_START(FEHullShader); -+ state.pfnHsFunc(GetPrivateState(pDC), &hsContext); -+ RDTSC_STOP(FEHullShader, 0, 0); -+ -+ uint32_t numPrims = pa.NumPrims(); -+ UPDATE_STAT(HsInvocations, numPrims); -+ 
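-+ // For each patch in the SIMD: run the fixed-function tessellator on that patch's
-+ // tess factors, (re)size the thread-local DS output buffer for the generated domain
-+ // points, run the domain shader one SIMD of points at a time, then assemble the
-+ // tessellated prims and feed them to the GS / streamout / clip stages below.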
-+ const uint32_t* pPrimId = (const uint32_t*)&primID; -+ -+ for (uint32_t p = 0; p < numPrims; ++p) -+ { -+ // Run Tessellator -+ SWR_TS_TESSELLATED_DATA tsData = { 0 }; -+ RDTSC_START(FETessellation); -+ TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); -+ RDTSC_STOP(FETessellation, 0, 0); -+ -+ if (tsData.NumPrimitives == 0) -+ { -+ continue; -+ } -+ SWR_ASSERT(tsData.NumDomainPoints); -+ -+ // Allocate DS Output memory -+ uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; -+ size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; -+ if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) -+ { -+ _aligned_free(gt_pTessellationThreadData->pDSOutput); -+ gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(sizeof(simdvector) * requiredDSOutputVectors, 64); -+ gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; -+ } -+ SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); -+ SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors); -+ -+ // Run Domain Shader -+ SWR_DS_CONTEXT dsContext; -+ dsContext.PrimitiveID = pPrimId[p]; -+ dsContext.pCpIn = &hsContext.pCPout[p]; -+ dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; -+ dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; -+ dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; -+ dsContext.vectorStride = requiredDSVectorInvocations; -+ -+ for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) -+ { -+ RDTSC_START(FEDomainShader); -+ state.pfnDsFunc(GetPrivateState(pDC), &dsContext); -+ RDTSC_STOP(FEDomainShader, 0, 0); -+ } -+ UPDATE_STAT(DsInvocations, tsData.NumDomainPoints); -+ -+ PA_TESS tessPa( -+ pDC, -+ dsContext.pOutputData, -+ dsContext.vectorStride, -+ tsState.numDsOutputAttribs, -+ tsData.ppIndices, -+ tsData.NumPrimitives, -+ tsState.postDSTopology); -+ -+ while (tessPa.HasWork()) -+ { -+ simdvector prim[3]; // Only deal with triangles, lines, or points -+ // PaAssemble returns false if there is not enough verts to assemble. -+ RDTSC_START(FEPAAssemble); -+ bool assemble = tessPa.Assemble(VERTEX_POSITION_SLOT, prim); -+ RDTSC_STOP(FEPAAssemble, 1, 0); -+ -+ if (assemble) -+ { -+ if (HasGeometryShaderT) -+ { -+ GeometryShaderStage( -+ pDC, workerId, tessPa, pGsOut, pCutBuffer, pSoPrimData, -+ _simd_set1_epi32(dsContext.PrimitiveID)); -+ } -+ else -+ { -+ if (HasStreamOutT) -+ { -+ StreamOut(pDC, tessPa, workerId, pSoPrimData); -+ } -+ -+ if (HasRastT) -+ { -+ SWR_ASSERT(pfnClipFunc); -+ pfnClipFunc(pDC, tessPa, workerId, prim, -+ GenMask(tessPa.NumPrims()), primID); -+ } -+ } -+ } // if (assemble) -+ -+ tessPa.NextPrim(); -+ -+ } // while (tessPa.HasWork()) -+ } // for (uint32_t p = 0; p < numPrims; ++p) -+ -+ TSDestroyCtx(tsCtx); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief FE handler for SwrDraw. -+/// @tparam IsIndexedT - Is indexed drawing enabled -+/// @tparam HasTessellationT - Is tessellation enabled -+/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled -+/// @tparam HasStreamOutT - Is stream-out enabled -+/// @tparam HasRastT - Is rasterization enabled -+/// @param pContext - pointer to SWR context. -+/// @param pDC - pointer to draw context. -+/// @param workerId - thread's worker id. 
-+/// @param pUserData - Pointer to DRAW_WORK -+template < -+ bool IsIndexedT, -+ bool HasTessellationT, -+ bool HasGeometryShaderT, -+ bool HasStreamOutT, -+ bool HasRastT> -+void ProcessDraw( -+ SWR_CONTEXT *pContext, -+ DRAW_CONTEXT *pDC, -+ uint32_t workerId, -+ void *pUserData) -+{ -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_QUEUE_FE) -+ { -+ pDC->doneFE = 1; -+ return; -+ } -+#endif -+ -+ RDTSC_START(FEProcessDraw); -+ -+ DRAW_WORK& work = *(DRAW_WORK*)pUserData; -+ const API_STATE& state = GetApiState(pDC); -+ __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ SWR_VS_CONTEXT vsContext; -+ simdvertex vin; -+ -+ int indexSize = 0; -+ int32_t endVertex = work.numVerts; -+ const int32_t* pLastRequestedIndex = nullptr; -+ if (IsIndexedT) -+ { -+ switch (work.type) -+ { -+ case R32_UINT: -+ indexSize = sizeof(uint32_t); -+ pLastRequestedIndex = &(work.pIB[endVertex]); -+ break; -+ case R16_UINT: -+ indexSize = sizeof(uint16_t); -+ // nasty address offset to last index -+ pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); -+ break; -+ case R8_UINT: -+ indexSize = sizeof(uint8_t); -+ // nasty address offset to last index -+ pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); -+ break; -+ default: -+ SWR_ASSERT(0); -+ } -+ } -+ -+ SWR_FETCH_CONTEXT fetchInfo = { 0 }; -+ fetchInfo.pStreams = &state.vertexBuffers[0]; -+ fetchInfo.StartInstance = work.startInstance; -+ fetchInfo.StartVertex = 0; -+ -+ vsContext.pVin = &vin; -+ -+ if (IsIndexedT) -+ { -+ fetchInfo.BaseVertex = work.baseVertex; -+ -+ // if the entire index buffer isn't being consumed, set the last index -+ // so that fetches < a SIMD wide will be masked off -+ fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); -+ if (pLastRequestedIndex < fetchInfo.pLastIndex) -+ { -+ fetchInfo.pLastIndex = pLastRequestedIndex; -+ } -+ } -+ else -+ { -+ fetchInfo.StartVertex = work.startVertex; -+ } -+ -+#ifdef KNOB_ENABLE_RDTSC -+ uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); -+#endif -+ -+ void* pGsOut = nullptr; -+ void* pCutBuffer = nullptr; -+ if (HasGeometryShaderT) -+ { -+ AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer); -+ } -+ -+ if (HasTessellationT) -+ { -+ SWR_ASSERT(state.tsState.tsEnable == true); -+ SWR_ASSERT(state.pfnHsFunc != nullptr); -+ SWR_ASSERT(state.pfnDsFunc != nullptr); -+ -+ AllocateTessellationData(pContext); -+ } -+ else -+ { -+ SWR_ASSERT(state.tsState.tsEnable == false); -+ SWR_ASSERT(state.pfnHsFunc == nullptr); -+ SWR_ASSERT(state.pfnDsFunc == nullptr); -+ } -+ -+ // allocate space for streamout input prim data -+ uint32_t* pSoPrimData = nullptr; -+ if (HasStreamOutT) -+ { -+ pSoPrimData = (uint32_t*)pDC->arena.AllocAligned(4096, 16); -+ } -+ -+ // choose primitive assembler -+ PA_FACTORY paFactory(pDC, IsIndexedT, state.topology, work.numVerts); -+ PA_STATE& pa = paFactory.GetPA(); -+ -+ /// @todo: temporarily move instance loop in the FE to ensure SO ordering -+ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) -+ { -+ simdscalari vIndex; -+ int32_t i = 0; -+ -+ if (IsIndexedT) -+ { -+ fetchInfo.pIndices = work.pIB; -+ } -+ else -+ { -+ vIndex = _simd_add_epi32(_simd_set1_epi32(i), vScale); -+ fetchInfo.pIndices = (const int32_t*)&vIndex; -+ } -+ -+ fetchInfo.CurInstance = instanceNum; -+ vsContext.InstanceID = instanceNum; -+ -+ while (pa.HasWork()) -+ { -+ // PaGetNextVsOutput currently has the side effect of updating some PA state machine state. 
-+ // So we need to keep this outside of (i < endVertex) check. -+ simdmask* pvCutIndices = nullptr; -+ if (IsIndexedT) -+ { -+ pvCutIndices = &pa.GetNextVsIndices(); -+ } -+ -+ simdvertex& vout = pa.GetNextVsOutput(); -+ vsContext.pVout = &vout; -+ -+ if (i < endVertex) -+ { -+ -+ // 1. Execute FS/VS for a single SIMD. -+ RDTSC_START(FEFetchShader); -+ state.pfnFetchFunc(fetchInfo, vin); -+ RDTSC_STOP(FEFetchShader, 0, 0); -+ -+ // forward fetch generated vertex IDs to the vertex shader -+ vsContext.VertexID = fetchInfo.VertexID; -+ -+ // Setup active mask for vertex shader. -+ vsContext.mask = GenerateMask(endVertex - i); -+ -+ // forward cut mask to the PA -+ if (IsIndexedT) -+ { -+ *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); -+ } -+ -+ UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex)); -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_FETCH) -+#endif -+ { -+ RDTSC_START(FEVertexShader); -+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); -+ RDTSC_STOP(FEVertexShader, 0, 0); -+ -+ UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex)); -+ } -+ } -+ -+ // 2. Assemble primitives given the last two SIMD. -+ do -+ { -+ simdvector prim[MAX_NUM_VERTS_PER_PRIM]; -+ // PaAssemble returns false if there is not enough verts to assemble. -+ RDTSC_START(FEPAAssemble); -+ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); -+ RDTSC_STOP(FEPAAssemble, 1, 0); -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_FETCH) -+#endif -+ { -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_VS) -+#endif -+ { -+ if (assemble) -+ { -+ UPDATE_STAT(IaPrimitives, pa.NumPrims()); -+ -+ if (HasTessellationT) -+ { -+ TessellationStages( -+ pDC, workerId, pa, pGsOut, pCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); -+ } -+ else if (HasGeometryShaderT) -+ { -+ GeometryShaderStage( -+ pDC, workerId, pa, pGsOut, pCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); -+ } -+ else -+ { -+ // If streamout is enabled then stream vertices out to memory. 
-+ if (HasStreamOutT) -+ { -+ StreamOut(pDC, pa, workerId, pSoPrimData); -+ } -+ -+ if (HasRastT) -+ { -+ SWR_ASSERT(pDC->pState->pfnProcessPrims); -+ pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, -+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); -+ } -+ } -+ } -+ } -+ } -+ } while (pa.NextPrim()); -+ -+ i += KNOB_SIMD_WIDTH; -+ if (IsIndexedT) -+ { -+ fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); -+ } -+ else -+ { -+ vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); -+ } -+ } -+ pa.Reset(); -+ } -+ -+ _ReadWriteBarrier(); -+ pDC->doneFE = true; -+ RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId); -+} -+// Explicit Instantiation of all combinations -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void 
ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+template void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Expland points to give them area. -+/// @param tri - SOA vertices for triangles. -+static INLINE void ExpandPoint(simdvector tri[3], simdscalar size) -+{ -+ const float bloat = 0.5f; -+ -+ const __m256 vAdjust0X = _mm256_set_ps(-bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat); -+ const __m256 vAdjust0Y = _mm256_set_ps(-bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat); -+ const __m256 vAdjust1X = _mm256_set_ps(bloat, -bloat, bloat, -bloat, bloat, -bloat, bloat, -bloat); -+ const __m256 vAdjust1Y = _mm256_set_ps(bloat, bloat, bloat, bloat, bloat, bloat, bloat, bloat); -+ const __m256 vAdjust2X = _mm256_set_ps(bloat, bloat, bloat, bloat, bloat, bloat, bloat, bloat); -+ const __m256 vAdjust2Y = _mm256_set_ps(-bloat, bloat, -bloat, bloat, -bloat, bloat, -bloat, bloat); -+ -+ tri[0].x = _simd_fmadd_ps(size, vAdjust0X, tri[0].x); -+ tri[0].y = _simd_fmadd_ps(size, vAdjust0Y, tri[0].y); -+ tri[1].x = _simd_fmadd_ps(size, vAdjust1X, tri[1].x); -+ tri[1].y = _simd_fmadd_ps(size, vAdjust1Y, tri[1].y); -+ tri[2].x = _simd_fmadd_ps(size, vAdjust2X, tri[2].x); -+ tri[2].y = _simd_fmadd_ps(size, vAdjust2Y, tri[2].y); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Processes attributes for the backend based on linkage mask and -+/// linkage map. Essentially just doing an SOA->AOS conversion and pack. -+/// @param pDC - Draw context -+/// @param pa - Primitive Assembly state -+/// @param linkageMask - Specifies which VS outputs are routed to PS. -+/// @param pLinkageMap - maps VS attribute slot to PS slot -+/// @param triIndex - Triangle to process attributes for -+/// @param pBuffer - Output result -+template -+INLINE void ProcessAttributes( -+ DRAW_CONTEXT *pDC, -+ PA_STATE&pa, -+ uint32_t linkageMask, -+ const uint8_t* pLinkageMap, -+ uint32_t triIndex, -+ float *pBuffer) -+{ -+ DWORD slot = 0; -+ uint32_t mapIdx = 0; -+ while (_BitScanForward(&slot, linkageMask)) -+ { -+ linkageMask &= ~(1 << slot); // done with this bit. -+ -+ // compute absolute slot in vertex attrib array -+ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx++]; -+ -+ __m128 attrib[3]; // triangle attribs (always 4 wide) -+ pa.AssembleSingle(inputSlot, triIndex, attrib); -+ -+ for (uint32_t i = 0; i < NumVerts; ++i) -+ { -+ _mm_store_ps(pBuffer, attrib[i]); -+ pBuffer += 4; -+ } -+ -+ // pad out the attrib buffer to 3 verts to ensure the triangle -+ // interpolation code in the pixel shader works correctly for the -+ // 3 topologies - point, line, tri. This effectively zeros out the -+ // effect of the missing vertices in the triangle interpolation. 
-+ for (uint32_t i = NumVerts; i < 3; ++i) -+ { -+ _mm_store_ps(pBuffer, attrib[NumVerts - 1]); -+ pBuffer += 4; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Processes enabled user clip distances. Loads the active clip -+/// distances from the PA, sets up barycentric equations, and -+/// stores the results to the output buffer -+/// @param pa - Primitive Assembly state -+/// @param primIndex - primitive index to process -+/// @param clipDistMask - mask of enabled clip distances -+/// @param pUserClipBuffer - buffer to store results -+template -+void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer) -+{ -+ DWORD clipDist; -+ while (_BitScanForward(&clipDist, clipDistMask)) -+ { -+ clipDistMask &= ~(1 << clipDist); -+ uint32_t clipSlot = clipDist >> 2; -+ uint32_t clipComp = clipDist & 0x3; -+ uint32_t clipAttribSlot = clipSlot == 0 ? -+ VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; -+ -+ __m128 primClipDist[3]; -+ pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); -+ -+ float vertClipDist[NumVerts]; -+ for (uint32_t e = 0; e < NumVerts; ++e) -+ { -+ OSALIGNSIMD(float) aVertClipDist[4]; -+ _mm_store_ps(aVertClipDist, primClipDist[e]); -+ vertClipDist[e] = aVertClipDist[clipComp]; -+ }; -+ -+ // setup plane equations for barycentric interpolation in the backend -+ float baryCoeff[NumVerts]; -+ for (uint32_t e = 0; e < NumVerts - 1; ++e) -+ { -+ baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1]; -+ } -+ baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1]; -+ -+ for (uint32_t e = 0; e < NumVerts; ++e) -+ { -+ *(pUserClipBuffer++) = baryCoeff[e]; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping -+/// culling, viewport transform, etc. -+/// @param pDC - pointer to draw context. -+/// @param pa - The primitive assembly object. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param tri - Contains triangle position data for SIMDs worth of triangles. -+/// @param primID - Primitive ID for each triangle. 
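-+/// @param triMask - Mask of SIMD lanes containing valid triangles to bin.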
-+void BinTriangles( -+ DRAW_CONTEXT *pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ simdvector tri[3], -+ uint32_t triMask, -+ simdscalari primID) -+{ -+ RDTSC_START(FEBinTriangles); -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_FRONTEND_STATE& feState = state.frontendState; -+ const SWR_GS_STATE& gsState = state.gsState; -+ -+ // Simple wireframe mode for debugging purposes only -+ -+ simdscalar vRecipW0 = _simd_set1_ps(1.0f); -+ simdscalar vRecipW1 = _simd_set1_ps(1.0f); -+ simdscalar vRecipW2 = _simd_set1_ps(1.0f); -+ -+ if (!feState.vpTransformDisable) -+ { -+ // perspective divide -+ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); -+ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); -+ vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); -+ -+ tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); -+ tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); -+ tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); -+ -+ tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); -+ tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); -+ tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); -+ -+ tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); -+ tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); -+ tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); -+ -+ // viewport transform to screen coords -+ viewportTransform<3>(tri, state.vpMatrix[0]); -+ } -+ -+ // bloat points to tri -+ if (pa.binTopology == TOP_POINT_LIST) -+ { -+ if (rastState.pointParam) -+ { -+ simdvector size[3]; -+ pa.Assemble(rastState.pointSizeAttrib, size); -+ ExpandPoint(tri, size[0].x); -+ } -+ else -+ { -+ ExpandPoint(tri, _simd_set1_ps(rastState.pointSize)); -+ } -+ } -+ -+ // convert to fixed point -+ simdscalari vXi[3], vYi[3]; -+ vXi[0] = fpToFixedPointVertical(tri[0].x); -+ vYi[0] = fpToFixedPointVertical(tri[0].y); -+ vXi[1] = fpToFixedPointVertical(tri[1].x); -+ vYi[1] = fpToFixedPointVertical(tri[1].y); -+ vXi[2] = fpToFixedPointVertical(tri[2].x); -+ vYi[2] = fpToFixedPointVertical(tri[2].y); -+ -+ // triangle setup -+ simdscalari vAi[3], vBi[3]; -+ triangleSetupABIntVertical(vXi, vYi, vAi, vBi); -+ -+ // determinant -+ simdscalari vDet[2]; -+ calcDeterminantIntVertical(vAi, vBi, vDet); -+ -+ // cull zero area -+ int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); -+ int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); -+ -+ int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2)); -+ -+ uint32_t origTriMask = triMask; -+ triMask &= ~cullZeroAreaMask; -+ -+ // determine front winding tris -+ // CW +det -+ // CCW -det -+ maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); -+ maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); -+ int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); -+ -+ uint32_t frontWindingTris; -+ if (rastState.frontWinding == SWR_FRONTWINDING_CW) -+ { -+ frontWindingTris = cwTriMask; -+ } -+ else -+ { -+ frontWindingTris = ~cwTriMask; -+ } -+ -+ // cull -+ uint32_t cullTris; -+ switch ((SWR_CULLMODE)rastState.cullMode) -+ { -+ case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; -+ case SWR_CULLMODE_NONE: cullTris = 0x0; break; -+ case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; -+ case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; -+ default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; 
break; -+ } -+ -+ triMask &= ~cullTris; -+ -+ if (origTriMask ^ triMask) -+ { -+ RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); -+ } -+ -+ // compute per tri backface -+ uint32_t frontFaceMask = frontWindingTris; -+ -+ uint32_t *pPrimID = (uint32_t *)&primID; -+ DWORD triIndex = 0; -+ -+ if (!triMask) -+ { -+ goto endBinTriangles; -+ } -+ -+ // Calc bounding box of triangles -+ simdBBox bbox; -+ calcBoundingBoxIntVertical(vXi, vYi, bbox); -+ -+ // determine if triangle falls between pixel centers and discard -+ // only discard for non-MSAA case -+ // (left + 127) & ~255 -+ // (right + 128) & ~255 -+ -+ if(rastState.sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ origTriMask = triMask; -+ -+ int cullCenterMask; -+ { -+ simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127)); -+ left = _simd_and_si(left, _simd_set1_epi32(~255)); -+ simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128)); -+ right = _simd_and_si(right, _simd_set1_epi32(~255)); -+ -+ simdscalari vMaskH = _simd_cmpeq_epi32(left, right); -+ -+ simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127)); -+ top = _simd_and_si(top, _simd_set1_epi32(~255)); -+ simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128)); -+ bottom = _simd_and_si(bottom, _simd_set1_epi32(~255)); -+ -+ simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom); -+ vMaskV = _simd_or_si(vMaskH, vMaskV); -+ cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); -+ } -+ -+ triMask &= ~cullCenterMask; -+ -+ if(origTriMask ^ triMask) -+ { -+ RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); -+ } -+ } -+ -+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. -+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); -+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); -+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); -+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); -+ -+ // Cull tris completely outside scissor -+ { -+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); -+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); -+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); -+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); -+ triMask = triMask & ~maskOutsideScissor; -+ } -+ -+ if (!triMask) -+ { -+ goto endBinTriangles; -+ } -+ -+ // Convert triangle bbox to macrotile units. 
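-+ // The arithmetic shift removes the x.8 fixed-point fraction and divides by the macro
-+ // tile dimension in one step (KNOB_MACROTILE_*_DIM_FIXED_SHIFT presumably folds
-+ // FIXED_POINT_SHIFT into the macro tile shift), yielding inclusive macro tile indices
-+ // for the enqueue loops below.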
-+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ -+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aMTLeft, bbox.left); -+ _simd_store_si((simdscalari*)aMTRight, bbox.right); -+ _simd_store_si((simdscalari*)aMTTop, bbox.top); -+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); -+ -+ // transpose verts needed for backend -+ /// @todo modify BE to take non-transformed verts -+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; -+ vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); -+ vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); -+ vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); -+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); -+ -+ // store render target array index -+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; -+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) -+ { -+ simdvector vRtai[3]; -+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai); -+ simdscalari vRtaii; -+ vRtaii = _simd_castps_si(vRtai[0].x); -+ _simd_store_si((simdscalari*)aRTAI, vRtaii); -+ } -+ else -+ { -+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); -+ } -+ -+ // scan remaining valid triangles and bin each separately -+ while (_BitScanForward(&triIndex, triMask)) -+ { -+ uint32_t linkageCount = state.linkageCount; -+ uint32_t linkageMask = state.linkageMask; -+ uint32_t numScalarAttribs = linkageCount * 4; -+ -+ BE_WORK work; -+ work.type = DRAW; -+ -+ TRIANGLE_WORK_DESC &desc = work.desc.tri; -+ -+ desc.triFlags.frontFacing = state.forceFront ? 
1 : ((frontFaceMask >> triIndex) & 1); -+ desc.triFlags.primID = pPrimID[triIndex]; -+ desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; -+ -+ work.pfnWork = gRasterizerTable[rastState.sampleCount]; -+ -+ // store active attribs -+ float *pAttribs = (float*)pDC->arena.AllocAligned(numScalarAttribs*3*sizeof(float), 16); -+ desc.pAttribs = pAttribs; -+ desc.numAttribs = linkageCount; -+ ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs); -+ -+ // store triangle vertex data -+ desc.pTriBuffer = (float*)pDC->arena.AllocAligned(4*4*sizeof(float), 16); -+ -+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); -+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); -+ _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); -+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); -+ -+ // store user clip distances -+ if (rastState.clipDistanceMask) -+ { -+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); -+ desc.pUserClipBuffer = (float*)pDC->arena.Alloc(numClipDist * 3 * sizeof(float)); -+ ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); -+ } -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) -+ { -+ for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) -+ { -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_SETUP_TRIS) -+#endif -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ } -+ -+ triMask &= ~(1 << triIndex); -+ } -+ -+endBinTriangles: -+ RDTSC_STOP(FEBinTriangles, 1, 0); -+} -+ -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Bin SIMD points to the backend. Only supports point size of 1 -+/// @param pDC - pointer to draw context. -+/// @param pa - The primitive assembly object. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param tri - Contains point position data for SIMDs worth of points. -+/// @param primID - Primitive ID for each point. 
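-+/// @param primMask - Mask of SIMD lanes containing valid points to bin.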
-+void BinPoints( -+ DRAW_CONTEXT *pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ simdvector prim[3], -+ uint32_t primMask, -+ simdscalari primID) -+{ -+ RDTSC_START(FEBinPoints); -+ -+ simdvector& primVerts = prim[0]; -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_GS_STATE& gsState = state.gsState; -+ -+ // perspective divide -+ simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); -+ primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); -+ primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); -+ primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); -+ -+ // viewport transform to screen coords -+ viewportTransform<1>(&primVerts, state.vpMatrix[0]); -+ -+ // convert to fixed point -+ simdscalari vXi, vYi; -+ vXi = fpToFixedPointVertical(primVerts.x); -+ vYi = fpToFixedPointVertical(primVerts.y); -+ -+ // adjust for triangle rasterization rules - ie top-left rule -+ vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); -+ vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); -+ -+ // cull points off the top-left edge of the viewport -+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); -+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); -+ -+ // compute macro tile coordinates -+ simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ -+ OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aMacroX, macroX); -+ _simd_store_si((simdscalari*)aMacroY, macroY); -+ -+ // compute raster tile coordinates -+ simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ -+ // compute raster tile relative x,y for coverage mask -+ simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); -+ simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); -+ -+ simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); -+ simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); -+ -+ OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; -+ OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); -+ _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); -+ -+ OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; -+ OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); -+ _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); -+ -+ OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; -+ _simd_store_ps((float*)aZ, primVerts.z); -+ -+ // store render target array index -+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; -+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) -+ { -+ simdvector vRtai; -+ pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); -+ simdscalari vRtaii = _simd_castps_si(vRtai.x); -+ _simd_store_si((simdscalari*)aRTAI, vRtaii); -+ } -+ else -+ { -+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); -+ } -+ -+ uint32_t *pPrimID = (uint32_t *)&primID; -+ DWORD primIndex = 0; -+ // scan remaining valid triangles and bin each separately -+ while (_BitScanForward(&primIndex, primMask)) -+ { -+ uint32_t linkageCount = state.linkageCount; -+ uint32_t linkageMask = state.linkageMask; -+ -+ uint32_t numScalarAttribs = linkageCount * 4; -+ -+ BE_WORK work; -+ work.type 
= DRAW; -+ -+ TRIANGLE_WORK_DESC &desc = work.desc.tri; -+ -+ // points are always front facing -+ desc.triFlags.frontFacing = 1; -+ desc.triFlags.primID = pPrimID[primIndex]; -+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; -+ -+ work.pfnWork = rastPoint; -+ -+ // store attributes -+ float *pAttribs = (float*)pDC->arena.AllocAligned(3 * numScalarAttribs * sizeof(float), 16); -+ desc.pAttribs = pAttribs; -+ desc.numAttribs = linkageCount; -+ -+ ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs); -+ -+ // store raster tile aligned x, y, perspective correct z -+ float *pTriBuffer = (float*)pDC->arena.AllocAligned(4 * sizeof(float), 16); -+ desc.pTriBuffer = pTriBuffer; -+ *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; -+ *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; -+ *pTriBuffer = aZ[primIndex]; -+ -+ uint32_t tX = aTileRelativeX[primIndex]; -+ uint32_t tY = aTileRelativeY[primIndex]; -+ -+ // pack the relative x,y into the coverageMask, the rasterizer will -+ // generate the true coverage mask from it -+ work.desc.tri.triFlags.coverageMask = tX | (tY << 4); -+ -+ // bin it -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_SETUP_TRIS) -+#endif -+ { -+ pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); -+ } -+ primMask &= ~(1 << primIndex); -+ } -+ -+ RDTSC_STOP(FEBinPoints, 1, 0); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Bin SIMD lines to the backend. -+/// @param pDC - pointer to draw context. -+/// @param pa - The primitive assembly object. -+/// @param workerId - thread's worker id. Even thread has a unique id. -+/// @param tri - Contains line position data for SIMDs worth of points. -+/// @param primID - Primitive ID for each line. 
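-+/// @param primMask - Mask of SIMD lanes containing valid lines to bin.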
-+void BinLines( -+ DRAW_CONTEXT *pDC, -+ PA_STATE& pa, -+ uint32_t workerId, -+ simdvector prim[], -+ uint32_t primMask, -+ simdscalari primID) -+{ -+ RDTSC_START(FEBinLines); -+ -+ const API_STATE& state = GetApiState(pDC); -+ const SWR_RASTSTATE& rastState = state.rastState; -+ const SWR_FRONTEND_STATE& feState = state.frontendState; -+ const SWR_GS_STATE& gsState = state.gsState; -+ -+ simdscalar vRecipW0 = _simd_set1_ps(1.0f); -+ simdscalar vRecipW1 = _simd_set1_ps(1.0f); -+ -+ if (!feState.vpTransformDisable) -+ { -+ // perspective divide -+ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); -+ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); -+ -+ prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); -+ prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); -+ -+ prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); -+ prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); -+ -+ prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); -+ prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); -+ -+ // viewport transform to screen coords -+ viewportTransform<2>(prim, state.vpMatrix[0]); -+ } -+ -+ // convert to fixed point -+ simdscalari vXi[2], vYi[2]; -+ vXi[0] = fpToFixedPointVertical(prim[0].x); -+ vYi[0] = fpToFixedPointVertical(prim[0].y); -+ vXi[1] = fpToFixedPointVertical(prim[1].x); -+ vYi[1] = fpToFixedPointVertical(prim[1].y); -+ -+ // compute x-major vs y-major mask -+ simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); -+ simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); -+ simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); -+ uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); -+ -+ // cull zero-length lines -+ simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); -+ vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); -+ -+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); -+ -+ uint32_t *pPrimID = (uint32_t *)&primID; -+ -+ simdscalar vUnused = _simd_setzero_ps(); -+ -+ // Calc bounding box of lines -+ simdBBox bbox; -+ bbox.left = _simd_min_epi32(vXi[0], vXi[1]); -+ bbox.right = _simd_max_epi32(vXi[0], vXi[1]); -+ bbox.top = _simd_min_epi32(vYi[0], vYi[1]); -+ bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]); -+ -+ // bloat bbox by line width along minor axis -+ simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); -+ simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); -+ simdBBox bloatBox; -+ bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); -+ bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi); -+ bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); -+ bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); -+ -+ bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask); -+ bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask); -+ bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask); -+ bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask); -+ -+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. 
-+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); -+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); -+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); -+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); -+ -+ // Cull prims completely outside scissor -+ { -+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); -+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); -+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); -+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); -+ primMask = primMask & ~maskOutsideScissor; -+ } -+ -+ if (!primMask) -+ { -+ goto endBinLines; -+ } -+ -+ // Convert triangle bbox to macrotile units. -+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); -+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); -+ -+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; -+ _simd_store_si((simdscalari*)aMTLeft, bbox.left); -+ _simd_store_si((simdscalari*)aMTRight, bbox.right); -+ _simd_store_si((simdscalari*)aMTTop, bbox.top); -+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); -+ -+ // transpose verts needed for backend -+ /// @todo modify BE to take non-transformed verts -+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; -+ vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); -+ vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); -+ vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); -+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); -+ -+ // store render target array index -+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; -+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) -+ { -+ simdvector vRtai[2]; -+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai); -+ simdscalari vRtaii = _simd_castps_si(vRtai[0].x); -+ _simd_store_si((simdscalari*)aRTAI, vRtaii); -+ } -+ else -+ { -+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); -+ } -+ -+ // scan remaining valid prims and bin each separately -+ DWORD primIndex; -+ while (_BitScanForward(&primIndex, primMask)) -+ { -+ uint32_t linkageCount = state.linkageCount; -+ uint32_t linkageMask = state.linkageMask; -+ uint32_t numScalarAttribs = linkageCount * 4; -+ -+ BE_WORK work; -+ work.type = DRAW; -+ -+ TRIANGLE_WORK_DESC &desc = work.desc.tri; -+ -+ desc.triFlags.frontFacing = 1; -+ desc.triFlags.primID = pPrimID[primIndex]; -+ desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; -+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; -+ -+ work.pfnWork = RasterizeLine; -+ -+ // store active attribs -+ desc.pAttribs = (float*)pDC->arena.AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); -+ desc.numAttribs = linkageCount; -+ ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs); -+ -+ // store line vertex data -+ desc.pTriBuffer = (float*)pDC->arena.AllocAligned(4 * 4 * sizeof(float), 16); -+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); -+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); -+ 
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); -+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); -+ -+ // store user clip distances -+ if (rastState.clipDistanceMask) -+ { -+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); -+ desc.pUserClipBuffer = (float*)pDC->arena.Alloc(numClipDist * 2 * sizeof(float)); -+ ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); -+ } -+ -+ MacroTileMgr *pTileMgr = pDC->pTileMgr; -+ for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) -+ { -+ for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) -+ { -+#if KNOB_ENABLE_TOSS_POINTS -+ if (!KNOB_TOSS_SETUP_TRIS) -+#endif -+ { -+ pTileMgr->enqueue(x, y, &work); -+ } -+ } -+ } -+ -+ primMask &= ~(1 << primIndex); -+ } -+ -+endBinLines: -+ -+ RDTSC_STOP(FEBinLines, 1, 0); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h -new file mode 100644 -index 0000000..e8452c3 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h -@@ -0,0 +1,326 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file frontend.h -+* -+* @brief Definitions for Frontend which handles vertex processing, -+* primitive assembly, clipping, binning, etc. 
-+* -+******************************************************************************/ -+#pragma once -+#include "context.h" -+ -+INLINE -+__m128i fpToFixedPoint(const __m128 vIn) -+{ -+ __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); -+ return _mm_cvtps_epi32(vFixed); -+} -+ -+INLINE -+simdscalari fpToFixedPointVertical(const simdscalar vIn) -+{ -+ simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(FIXED_POINT_SCALE)); -+ return _simd_cvtps_epi32(vFixed); -+} -+ -+ -+// Calculates the A and B coefficients for the 3 edges of the triangle -+// -+// maths for edge equations: -+// standard form of a line in 2d -+// Ax + By + C = 0 -+// A = y0 - y1 -+// B = x1 - x0 -+// C = x0y1 - x1y0 -+INLINE -+void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) -+{ -+ // vYsub = y1 y2 y0 dc -+ __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); -+ // vY = y0 y1 y2 dc -+ vA = _mm_sub_ps(vY, vYsub); -+ -+ // Result: -+ // A[0] = y0 - y1 -+ // A[1] = y1 - y2 -+ // A[2] = y2 - y0 -+ -+ // vXsub = x1 x2 x0 dc -+ __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); -+ // vX = x0 x1 x2 dc -+ vB = _mm_sub_ps(vXsub, vX); -+ -+ // Result: -+ // B[0] = x1 - x0 -+ // B[1] = x2 - x1 -+ // B[2] = x0 - x2 -+} -+ -+INLINE -+void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) -+{ -+ // generate edge equations -+ // A = y0 - y1 -+ // B = x1 - x0 -+ vA[0] = _simd_sub_ps(vY[0], vY[1]); -+ vA[1] = _simd_sub_ps(vY[1], vY[2]); -+ vA[2] = _simd_sub_ps(vY[2], vY[0]); -+ -+ vB[0] = _simd_sub_ps(vX[1], vX[0]); -+ vB[1] = _simd_sub_ps(vX[2], vX[1]); -+ vB[2] = _simd_sub_ps(vX[0], vX[2]); -+} -+ -+INLINE -+void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) -+{ -+ // generate edge equations -+ // A = y0 - y1 -+ // B = x1 - x0 -+ // C = x0y1 - x1y0 -+ __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); -+ vA = _mm_sub_epi32(vY, vYsub); -+ -+ __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); -+ vB = _mm_sub_epi32(vXsub, vX); -+} -+ -+INLINE -+void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) -+{ -+ // A = y0 - y1 -+ // B = x1 - x0 -+ vA[0] = _simd_sub_epi32(vY[0], vY[1]); -+ vA[1] = _simd_sub_epi32(vY[1], vY[2]); -+ vA[2] = _simd_sub_epi32(vY[2], vY[0]); -+ -+ vB[0] = _simd_sub_epi32(vX[1], vX[0]); -+ vB[1] = _simd_sub_epi32(vX[2], vX[1]); -+ vB[2] = _simd_sub_epi32(vX[0], vX[2]); -+} -+// Calculate the determinant of the triangle -+// 2 vectors between the 3 points: P, Q -+// Px = x0-x2, Py = y0-y2 -+// Qx = x1-x2, Qy = y1-y2 -+// |Px Qx| -+// det = | | = PxQy - PyQx -+// |Py Qy| -+// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) -+// try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx
-+//   : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
-+//   : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
-+//   : B[2]*A[1] - A[2]*B[1]
-+INLINE
-+float calcDeterminantInt(const __m128i vA, const __m128i vB)
-+{
-+    // vAShuf = [A1, A0, A2, A0]
-+    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
-+    // vBShuf = [B2, B0, B1, B0]
-+    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
-+    // vMul = [A1*B2, B1*A2]
-+    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
-+
-+    // shuffle upper to lower
-+    // vMul2 = [B1*A2, B1*A2]
-+    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
-+    // vMul = [A1*B2 - B1*A2]
-+    vMul = _mm_sub_epi64(vMul, vMul2);
-+
-+    // According to emmintrin.h, _mm_store1_pd() address must be 16-byte aligned
-+    OSALIGN(int64_t, 16) result;
-+    _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
-+
-+    double fResult = (double)result;
-+    fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
-+
-+    return (float)fResult;
-+}
-+
-+INLINE
-+void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
-+{
-+    // refer to calcDeterminantInt comment for calculation explanation
-+    // A1*B2
-+    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);   // 0 0 1 1 4 4 5 5
-+    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);   // 2 2 3 3 6 6 7 7
-+
-+    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
-+    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
-+
-+    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo);      // 0 1 4 5
-+    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi);      // 2 3 6 7
-+
-+    // B1*A2
-+    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
-+    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
-+
-+    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
-+    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
-+
-+    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
-+    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
-+
-+    // A1*B2 - A2*B1
-+    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
-+    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
-+
-+    // shuffle 0 1 4 5 -> 0 1 2 3
-+    simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
-+    simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
-+
-+    pvDet[0] = vResultLo;
-+    pvDet[1] = vResultHi;
-+}
-+
-+INLINE
-+void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
-+{
-+    // C = -Ax - By
-+    vC = _mm_mul_ps(vA, vX);
-+    __m128 vCy = _mm_mul_ps(vB, vY);
-+    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
-+    vC = _mm_sub_ps(vC, vCy);
-+}
-+
-+INLINE
-+void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
-+{
-+    vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
-+    vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
-+
-+    vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
-+    vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
-+
-+    vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
-+    vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
-+}
-+
-+template <uint32_t NumVerts>
-+INLINE
-+void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix)
-+{
-+    simdscalar m00 = _simd_load1_ps(&vpMatrix.m00);
-+    simdscalar m30 = _simd_load1_ps(&vpMatrix.m30);
-+    simdscalar m11 = _simd_load1_ps(&vpMatrix.m11);
-+    simdscalar m31 = _simd_load1_ps(&vpMatrix.m31);
-+    simdscalar m22 = _simd_load1_ps(&vpMatrix.m22);
-+    simdscalar m32 = _simd_load1_ps(&vpMatrix.m32);
-+
-+    for (uint32_t i = 0; i < NumVerts; ++i)
-+    {
-+        v[i].x = 
_simd_fmadd_ps(v[i].x, m00, m30); -+ v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); -+ v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); -+ } -+} -+ -+INLINE -+void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) -+{ -+ // Need horizontal fp min here -+ __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); -+ __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); -+ -+ __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); -+ __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); -+ -+ -+ __m128i vMinX = _mm_min_epi32(vX, vX1); -+ vMinX = _mm_min_epi32(vMinX, vX2); -+ -+ __m128i vMaxX = _mm_max_epi32(vX, vX1); -+ vMaxX = _mm_max_epi32(vMaxX, vX2); -+ -+ __m128i vMinY = _mm_min_epi32(vY, vY1); -+ vMinY = _mm_min_epi32(vMinY, vY2); -+ -+ __m128i vMaxY = _mm_max_epi32(vY, vY1); -+ vMaxY = _mm_max_epi32(vMaxY, vY2); -+ -+ bbox.left = _mm_extract_epi32(vMinX, 0); -+ bbox.right = _mm_extract_epi32(vMaxX, 0); -+ bbox.top = _mm_extract_epi32(vMinY, 0); -+ bbox.bottom = _mm_extract_epi32(vMaxY, 0); -+ -+#if 0 -+ Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0) -+B = _mm_shuffle_ps(Z, W, 0 0 0 0) -+A = _mm_shuffle_epi32(A, 3 0 3 0) -+A = _mm_shuffle_ps(A, B, 1 0 1 0) -+#endif -+ -+} -+ -+INLINE -+void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox) -+{ -+ simdscalari vMinX = vX[0]; -+ vMinX = _simd_min_epi32(vMinX, vX[1]); -+ vMinX = _simd_min_epi32(vMinX, vX[2]); -+ -+ simdscalari vMaxX = vX[0]; -+ vMaxX = _simd_max_epi32(vMaxX, vX[1]); -+ vMaxX = _simd_max_epi32(vMaxX, vX[2]); -+ -+ simdscalari vMinY = vY[0]; -+ vMinY = _simd_min_epi32(vMinY, vY[1]); -+ vMinY = _simd_min_epi32(vMinY, vY[2]); -+ -+ simdscalari vMaxY = vY[0]; -+ vMaxY = _simd_max_epi32(vMaxY, vY[1]); -+ vMaxY = _simd_max_epi32(vMaxY, vY[2]); -+ -+ bbox.left = vMinX; -+ bbox.right = vMaxX; -+ bbox.top = vMinY; -+ bbox.bottom = vMaxY; -+} -+ -+INLINE -+bool CanUseSimplePoints(DRAW_CONTEXT *pDC) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ -+ return (state.rastState.pointSize == 1.0f && -+ !state.rastState.pointParam && -+ !state.rastState.pointSpriteEnable); -+} -+ -+uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); -+uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); -+ -+// Templated Draw front-end function. 
All combinations of template parameter values are available -+template -+void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+ -+void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -+ -+struct PA_STATE_BASE; // forward decl -+void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID); -+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); -+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h -new file mode 100644 -index 0000000..6140790 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h -@@ -0,0 +1,139 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file knobs.h -+* -+* @brief Static (Compile-Time) Knobs for Core. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+ -+#define KNOB_ARCH_AVX 0 -+#define KNOB_ARCH_AVX2 1 -+#define KNOB_ARCH_AVX512 2 -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Architecture validation -+/////////////////////////////////////////////////////////////////////////////// -+#if !defined(KNOB_ARCH) -+#define KNOB_ARCH KNOB_ARCH_AVX -+#endif -+ -+#if (KNOB_ARCH == KNOB_ARCH_AVX) -+#define KNOB_ARCH_ISA AVX -+#define KNOB_ARCH_STR "AVX" -+#define KNOB_SIMD_WIDTH 8 -+#elif (KNOB_ARCH == KNOB_ARCH_AVX2) -+#define KNOB_ARCH_ISA AVX2 -+#define KNOB_ARCH_STR "AVX2" -+#define KNOB_SIMD_WIDTH 8 -+#elif (KNOB_ARCH == KNOB_ARCH_AVX512) -+#define KNOB_ARCH_ISA AVX512F -+#define KNOB_ARCH_STR "AVX512" -+#define KNOB_SIMD_WIDTH 16 -+#error "AVX512 not yet supported" -+#else -+#error "Unknown architecture" -+#endif -+ -+#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Configuration knobs -+/////////////////////////////////////////////////////////////////////////////// -+#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon. -+ -+// Maximum supported number of active vertex buffer streams -+#define KNOB_NUM_STREAMS 32 -+ -+// Maximum supported number of attributes per vertex -+#define KNOB_NUM_ATTRIBUTES 37 -+ -+// Maximum supported active viewports and scissors -+#define KNOB_NUM_VIEWPORTS_SCISSORS 16 -+ -+// Guardband range used by the clipper -+#define KNOB_GUARDBAND_WIDTH 4096.0f -+#define KNOB_GUARDBAND_HEIGHT 2048.0f -+ -+/////////////////////////////// -+// Macro tile configuration -+/////////////////////////////// -+ -+// raster tile dimensions -+#define KNOB_TILE_X_DIM 8 -+#define KNOB_TILE_X_DIM_SHIFT 3 -+#define KNOB_TILE_Y_DIM 8 -+#define KNOB_TILE_Y_DIM_SHIFT 3 -+ -+// fixed macrotile pixel dimension for now, eventually will be -+// dynamically set based on tile format and pixel size -+#define KNOB_MACROTILE_X_DIM 64 -+#define KNOB_MACROTILE_Y_DIM 64 -+#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) -+#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) -+#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14 -+#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14 -+#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) -+#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) -+ -+// total # of hot tiles available. 
This should be enough to -+// fully render a 16kx16k 128bpp render target -+#define KNOB_NUM_HOT_TILES_X 256 -+#define KNOB_NUM_HOT_TILES_Y 256 -+#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT -+#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT -+#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT -+ -+#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 -+#error "incompatible width/tile dimensions" -+#endif -+ -+#if KNOB_SIMD_WIDTH == 8 -+#define SIMD_TILE_X_DIM 4 -+#define SIMD_TILE_Y_DIM 2 -+#else -+#error "Invalid simd width" -+#endif -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Optimization knobs -+/////////////////////////////////////////////////////////////////////////////// -+#define KNOB_USE_FAST_SRGB TRUE -+ -+// enables cut-aware primitive assembler -+#define KNOB_ENABLE_CUT_AWARE_PA TRUE -+ -+/////////////////////////////////////////////////////////////////////////////// -+// Debug knobs -+/////////////////////////////////////////////////////////////////////////////// -+//#define KNOB_ENABLE_RDTSC -+//#define KNOB_SWRC_TRACING -+ -+// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. -+#if !defined(KNOB_ENABLE_TOSS_POINTS) -+#define KNOB_ENABLE_TOSS_POINTS 0 -+#endif -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h -new file mode 100644 -index 0000000..3f19555 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h -@@ -0,0 +1,98 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file knobs_init.h -+* -+* @brief Dynamic Knobs Initialization for Core. 
-+*
-+******************************************************************************/
-+#pragma once
-+
-+#include
-+#include
-+#include
-+#include
-+#include
-+
-+// Assume the type is compatible with a 32-bit integer
-+template <typename T>
-+static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
-+{
-+    uint32_t value = 0;
-+    if (sscanf(pOverride, "%u", &value))
-+    {
-+        knobValue = static_cast<T>(value);
-+    }
-+}
-+
-+static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
-+{
-+    size_t len = strlen(pOverride);
-+    if (len == 1)
-+    {
-+        auto c = tolower(pOverride[0]);
-+        if (c == 'y' || c == 't' || c == '1')
-+        {
-+            knobValue = true;
-+            return;
-+        }
-+        if (c == 'n' || c == 'f' || c == '0')
-+        {
-+            knobValue = false;
-+            return;
-+        }
-+    }
-+
-+    // Try converting to a number and casting to bool
-+    uint32_t value = 0;
-+    if (sscanf(pOverride, "%u", &value))
-+    {
-+        knobValue = value != 0;
-+        return;
-+    }
-+}
-+
-+static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
-+{
-+    float value = knobValue;
-+    if (sscanf(pOverride, "%f", &value))
-+    {
-+        knobValue = value;
-+    }
-+}
-+
-+template <typename T>
-+static inline void InitKnob(T& knob)
-+{
-+
-+    // TODO, read registry first
-+
-+    // Second, read environment variables
-+    const char* pOverride = getenv(knob.Name());
-+
-+    if (pOverride)
-+    {
-+        auto knobValue = knob.Value();
-+        ConvertEnvToKnob(pOverride, knobValue);
-+        knob.Value(knobValue);
-+    }
-+}
-diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
-new file mode 100644
-index 0000000..f7d5263
---- /dev/null
-+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
-@@ -0,0 +1,562 @@
-+/****************************************************************************
-+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+*
-+* Permission is hereby granted, free of charge, to any person obtaining a
-+* copy of this software and associated documentation files (the "Software"),
-+* to deal in the Software without restriction, including without limitation
-+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+* and/or sell copies of the Software, and to permit persons to whom the
-+* Software is furnished to do so, subject to the following conditions:
-+*
-+* The above copyright notice and this permission notice (including the next
-+* paragraph) shall be included in all copies or substantial portions of the
-+* Software.
-+*
-+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-+* IN THE SOFTWARE.
-+* -+* @file multisample.h -+* -+******************************************************************************/ -+ -+#pragma once -+ -+#include "context.h" -+#include "format_traits.h" -+ -+INLINE -+uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) -+{ -+ static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16}; -+ assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX); -+ return sampleCountLUT[sampleCount]; -+} -+ -+INLINE -+SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) -+{ -+ switch(numSamples) -+ { -+ case 1: return SWR_MULTISAMPLE_1X; -+ case 2: return SWR_MULTISAMPLE_2X; -+ case 4: return SWR_MULTISAMPLE_4X; -+ case 8: return SWR_MULTISAMPLE_8X; -+ case 16: return SWR_MULTISAMPLE_16X; -+ default: assert(0); return SWR_MULTISAMPLE_1X; -+ } -+} -+ -+// hardcoded offsets based on Direct3d standard multisample positions -+// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner -+// coords are 0.8 fixed point offsets from (0, 0) -+template -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) = delete; -+ INLINE static __m128i vYi(uint32_t sampleNum) = delete; -+ INLINE static simdscalar vX(uint32_t sampleNum) = delete; -+ INLINE static simdscalar vY(uint32_t sampleNum) = delete; -+ INLINE static __m128i TileSampleOffsetsX() = delete; -+ INLINE static __m128i TileSampleOffsetsY() = delete; -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete; -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete; -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete; -+ -+ static const uint32_t numSamples = 0; -+ static const uint32_t sampleMask = 0; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X = _mm_set1_epi32(0x80); -+ return X; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y = _mm_set1_epi32(0x80); -+ return Y; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X = _simd_set1_ps(0.5f); -+ return X; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y = _simd_set1_ps(0.5f); -+ return Y; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x80; -+ static const uint32_t bboxRightEdge = 0x80; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x80; -+ static const uint32_t bboxBottomEdge = 0x80; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ return 0; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ return 0; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ return 0; -+ } -+ -+ static const uint32_t numSamples = 1; -+ static const uint32_t sampleMask = 1; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] {_mm_set1_epi32(0xC0), _mm_set1_epi32(0x40)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ 
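// How the integer and float tables in this specialization relate (assuming
// the 0.8 fixed-point convention stated in the header comment above): vXi/vYi
// return the sample offsets as 8-bit fixed point, while vX/vY below return
// the same offsets as floats, e.g. for 2x MSAA:
//     sample 0: 0xC0 = 192, 192 / 256.0f = 0.75f   (matches vX(0))
//     sample 1: 0x40 =  64,  64 / 256.0f = 0.25f   (matches vX(1))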
INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] {_mm_set1_epi32(0xC0), _mm_set1_epi32(0x40)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x40; -+ static const uint32_t bboxRightEdge = 0xC0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x40; -+ static const uint32_t bboxBottomEdge = 0xC0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 2; -+ static const uint32_t sampleMask = 0x3; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] -+ {_mm_set1_epi32(0x60), _mm_set1_epi32(0xE0), _mm_set1_epi32(0x20), _mm_set1_epi32(0xA0)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] -+ {_mm_set1_epi32(0x20), _mm_set1_epi32(0x60), _mm_set1_epi32(0xA0), _mm_set1_epi32(0xE0)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] -+ {_simd_set1_ps(0.375f), _simd_set1_ps(0.875), _simd_set1_ps(0.125), _simd_set1_ps(0.625)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] -+ {_simd_set1_ps(0.125), _simd_set1_ps(0.375f), _simd_set1_ps(0.625), _simd_set1_ps(0.875)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x20; -+ static const uint32_t bboxRightEdge = 0xE0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = 
_mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x20; -+ static const uint32_t bboxBottomEdge = 0xE0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 4; -+ static const uint32_t sampleMask = 0xF; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] -+ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x70), _mm_set1_epi32(0xD0), _mm_set1_epi32(0x50), -+ _mm_set1_epi32(0x30), _mm_set1_epi32(0x10), _mm_set1_epi32(0xB0), _mm_set1_epi32(0xF0)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] -+ {_mm_set1_epi32(0x50), _mm_set1_epi32(0xB0), _mm_set1_epi32(0x90), _mm_set1_epi32(0x30), -+ _mm_set1_epi32(0xD0), _mm_set1_epi32(0x70), _mm_set1_epi32(0xF0), _mm_set1_epi32(0x10)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] -+ {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.8125), _simd_set1_ps(0.3125), -+ _simd_set1_ps(0.1875), _simd_set1_ps(0.0625), _simd_set1_ps(0.6875), _simd_set1_ps(0.9375)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] -+ {_simd_set1_ps(0.3125), _simd_set1_ps(0.6875), _simd_set1_ps(0.5625), _simd_set1_ps(0.1875), -+ _simd_set1_ps(0.8125), _simd_set1_ps(0.4375), _simd_set1_ps(0.9375), _simd_set1_ps(0.0625)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x10; -+ static const uint32_t bboxRightEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, 
bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x10; -+ static const uint32_t bboxBottomEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 8; -+ static const uint32_t sampleMask = 0xFF; -+}; -+ -+template<> -+struct MultisampleTraits -+{ -+ INLINE static __m128i vXi(uint32_t sampleNum) -+ { -+ static const __m128i X[numSamples] -+ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x70), _mm_set1_epi32(0x50), _mm_set1_epi32(0xC0), -+ _mm_set1_epi32(0x30), _mm_set1_epi32(0xA0), _mm_set1_epi32(0xD0), _mm_set1_epi32(0xB0), -+ _mm_set1_epi32(0x60), _mm_set1_epi32(0x80), _mm_set1_epi32(0x40), _mm_set1_epi32(0x20), -+ _mm_set1_epi32(0x00), _mm_set1_epi32(0xF0), _mm_set1_epi32(0xE0), _mm_set1_epi32(0x10)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static __m128i vYi(uint32_t sampleNum) -+ { -+ static const __m128i Y[numSamples] -+ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x50), _mm_set1_epi32(0xA0), _mm_set1_epi32(0x70), -+ _mm_set1_epi32(0x60), _mm_set1_epi32(0xD0), _mm_set1_epi32(0xB0), _mm_set1_epi32(0x30), -+ _mm_set1_epi32(0xE0), _mm_set1_epi32(0x10), _mm_set1_epi32(0x20), _mm_set1_epi32(0xC0), -+ _mm_set1_epi32(0x80), 
_mm_set1_epi32(0x40), _mm_set1_epi32(0xF0), _mm_set1_epi32(0x00)}; -+ SWR_ASSERT(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static simdscalar vX(uint32_t sampleNum) -+ { -+ static const simdscalar X[numSamples] -+ {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.3125), _simd_set1_ps(0.7500), -+ _simd_set1_ps(0.1875), _simd_set1_ps(0.6250), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), -+ _simd_set1_ps(0.3750), _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.1250), -+ _simd_set1_ps(0.0000), _simd_set1_ps(0.9375), _simd_set1_ps(0.8750), _simd_set1_ps(0.0625)}; -+ assert(sampleNum < numSamples); -+ return X[sampleNum]; -+ } -+ -+ INLINE static simdscalar vY(uint32_t sampleNum) -+ { -+ static const simdscalar Y[numSamples] -+ {_simd_set1_ps(0.5625), _simd_set1_ps(0.3125), _simd_set1_ps(0.6250), _simd_set1_ps(0.4375), -+ _simd_set1_ps(0.3750), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), _simd_set1_ps(0.1875), -+ _simd_set1_ps(0.8750), _simd_set1_ps(0.0625), _simd_set1_ps(0.1250), _simd_set1_ps(0.7500), -+ _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.9375), _simd_set1_ps(0.0000)}; -+ assert(sampleNum < numSamples); -+ return Y[sampleNum]; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsX() -+ { -+ static const uint32_t bboxLeftEdge = 0x00; -+ static const uint32_t bboxRightEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); -+ return tileSampleOffsetX; -+ } -+ -+ INLINE static __m128i TileSampleOffsetsY() -+ { -+ static const uint32_t bboxTopEdge = 0x00; -+ static const uint32_t bboxBottomEdge = 0xF0; -+ // BR, BL, UR, UL -+ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); -+ return tileSampleOffsetY; -+ } -+ -+ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileColorOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileColorOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileDepthOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM 
* KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileDepthOffsets[sampleNum]; -+ } -+ -+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) -+ { -+ static const uint32_t RasterTileStencilOffsets[numSamples] -+ { 0, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, -+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, -+ }; -+ assert(sampleNum < numSamples); -+ return RasterTileStencilOffsets[sampleNum]; -+ } -+ -+ static const uint32_t numSamples = 16; -+ static const uint32_t sampleMask = 0xFFFF; -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h -new file mode 100644 -index 0000000..52ea820 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h -@@ -0,0 +1,1205 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file pa.h -+* -+* @brief Definitions for primitive assembly. -+* N primitives are assembled at a time, where N is the SIMD width. -+* A state machine, that is specific for a given topology, drives the -+* assembly of vertices into triangles. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "frontend.h" -+ -+struct PA_STATE -+{ -+ DRAW_CONTEXT *pDC; // draw context -+ uint8_t* pStreamBase; // vertex stream -+ uint32_t streamSizeInVerts; // total size of the input stream in verts -+ -+ // The topology the binner will use. In some cases the FE changes the topology from the api state. -+ PRIMITIVE_TOPOLOGY binTopology; -+ -+ PA_STATE() {} -+ PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : -+ pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {} -+ -+ virtual bool HasWork() = 0; -+ virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; -+ virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; -+ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0; -+ virtual bool NextPrim() = 0; -+ virtual simdvertex& GetNextVsOutput() = 0; -+ virtual bool GetNextStreamOutput() = 0; -+ virtual simdmask& GetNextVsIndices() = 0; -+ virtual uint32_t NumPrims() = 0; -+ virtual void Reset() = 0; -+ virtual simdscalari GetPrimID(uint32_t startID) = 0; -+}; -+ -+// The Optimized PA is a state machine that assembles triangles from vertex shader simd -+// output. Here is the sequence -+// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). -+// 2. Execute PA function to assemble and bin triangles. -+// a. The PA function is a set of functions that collectively make up the -+// state machine for a given topology. -+// 1. We use a state index to track which PA function to call. -+// b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle. -+// 1. We call this the current and previous simd vertex. -+// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In -+// order to assemble the second triangle, for a triangle list, we'll need the -+// last vertex from the previous simd and the first 2 vertices from the current simd. -+// 3. At times the PA can assemble multiple triangles from the 2 simd vertices. -+// -+// This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without -+// cuts -+struct PA_STATE_OPT : public PA_STATE -+{ -+ simdvertex leadingVertex; // For tri-fan -+ uint32_t numPrims; // Total number of primitives for draw. -+ uint32_t numPrimsComplete; // Total number of complete primitives. -+ -+ uint32_t numSimdPrims; // Number of prims in current simd. -+ -+ uint32_t cur; // index to current VS output. -+ uint32_t prev; // index to prev VS output. Not really needed in the state. -+ uint32_t first; // index to first VS output. Used for trifan. 
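// Concrete example of the flow described above (assuming the 8-wide AVX SIMD
// configured in knobs.h): a full SIMD of triangle-list prims is 8 triangles,
// which consume 8 * 3 = 24 vertices, i.e. 3 simdvertex batches, so the PA
// must step through several simd vertices (tracked via 'cur'/'prev', plus
// 'first' for tri-fans) before Assemble() can hand a complete set of prims
// to the binner.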
-+ -+ uint32_t counter; // state counter -+ bool reset; // reset state -+ -+ uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2}) -+ simdscalari primID; -+ -+ typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); -+ typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+ PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles. -+ PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle. -+ -+ // state used to advance the PA when Next is called -+ PFN_PA_FUNC pfnPaNextFunc; -+ uint32_t nextNumSimdPrims; -+ uint32_t nextNumPrimsIncrement; -+ bool nextReset; -+ bool isStreaming; -+ -+ simdmask tmpIndices; // temporary index store for unused virtual function -+ -+ PA_STATE_OPT() {} -+ PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, -+ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); -+ -+ bool HasWork() -+ { -+ return (this->numPrimsComplete < this->numPrims) ? true : false; -+ } -+ -+ simdvector& GetSimdVector(uint32_t index, uint32_t slot) -+ { -+ simdvertex* pVertex = (simdvertex*)pStreamBase; -+ return pVertex[index].attrib[slot]; -+ } -+ -+ // Assembles 4 triangles. Each simdvector is a single vertex from 4 -+ // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. -+ bool Assemble(uint32_t slot, simdvector verts[]) -+ { -+ return this->pfnPaFunc(*this, slot, verts); -+ } -+ -+ // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). -+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) -+ { -+ return this->pfnPaSingleFunc(*this, slot, primIndex, verts); -+ } -+ -+ bool NextPrim() -+ { -+ this->pfnPaFunc = this->pfnPaNextFunc; -+ this->numSimdPrims = this->nextNumSimdPrims; -+ this->numPrimsComplete += this->nextNumPrimsIncrement; -+ this->reset = this->nextReset; -+ -+ if (this->isStreaming) -+ { -+ this->reset = false; -+ } -+ -+ bool morePrims = false; -+ -+ if (this->numSimdPrims > 0) -+ { -+ morePrims = true; -+ this->numSimdPrims--; -+ } -+ else -+ { -+ this->counter = (this->reset) ? 0 : (this->counter + 1); -+ this->reset = false; -+ } -+ -+ this->pfnPaFunc = this->pfnPaNextFunc; -+ -+ if (!HasWork()) -+ { -+ morePrims = false; // no more to do -+ } -+ -+ return morePrims; -+ } -+ -+ simdvertex& GetNextVsOutput() -+ { -+ // increment cur and prev indices -+ const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; -+ this->prev = this->cur; // prev is undefined for first state. -+ this->cur = this->counter % numSimdVerts; -+ -+ simdvertex* pVertex = (simdvertex*)pStreamBase; -+ return pVertex[this->cur]; -+ } -+ -+ simdmask& GetNextVsIndices() -+ { -+ // unused in optimized PA, pass tmp buffer back -+ return tmpIndices; -+ } -+ -+ bool GetNextStreamOutput() -+ { -+ this->prev = this->cur; -+ this->cur = this->counter; -+ -+ return HasWork(); -+ } -+ -+ uint32_t NumPrims() -+ { -+ return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? 
-+ (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; -+ } -+ -+ void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, -+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, -+ uint32_t numSimdPrims = 0, -+ uint32_t numPrimsIncrement = 0, -+ bool reset = false) -+ { -+ this->pfnPaNextFunc = pfnPaNextFunc; -+ this->nextNumSimdPrims = numSimdPrims; -+ this->nextNumPrimsIncrement = numPrimsIncrement; -+ this->nextReset = reset; -+ -+ this->pfnPaSingleFunc = pfnPaNextSingleFunc; -+ } -+ -+ void Reset() -+ { -+ this->numPrimsComplete = 0; -+ this->numSimdPrims = 0; -+ this->cur = 0; -+ this->prev = 0; -+ this->first = 0; -+ this->counter = 0; -+ this->reset = false; -+ } -+ -+ simdscalari GetPrimID(uint32_t startID) -+ { -+ return _simd_add_epi32(this->primID, -+ _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); -+ } -+}; -+ -+// helper C wrappers to avoid having to rewrite all the PA topology state functions -+INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, -+ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, -+ uint32_t numSimdPrims = 0, -+ uint32_t numPrimsIncrement = 0, -+ bool reset = false) -+{ -+ return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); -+} -+INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) -+{ -+ return pa.GetSimdVector(index, slot); -+} -+ -+INLINE __m128 swizzleLane0(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane1(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane2(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane3(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); -+} -+ -+INLINE __m128 swizzleLane4(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); -+ -+} -+ -+INLINE __m128 swizzleLane5(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); -+} -+ -+INLINE __m128 swizzleLane6(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); -+} -+ -+INLINE __m128 swizzleLane7(const simdvector &a) -+{ -+ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); -+ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); -+ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); -+} -+ -+INLINE __m128 swizzleLaneN(const simdvector &a, int lane) -+{ -+ switch (lane) { -+ case 0: -+ return swizzleLane0(a); -+ case 1: -+ return swizzleLane1(a); -+ case 2: -+ return swizzleLane2(a); 
-+ case 3: -+ return swizzleLane3(a); -+ case 4: -+ return swizzleLane4(a); -+ case 5: -+ return swizzleLane5(a); -+ case 6: -+ return swizzleLane6(a); -+ case 7: -+ return swizzleLane7(a); -+ default: -+ return _mm_setzero_ps(); -+ } -+} -+ -+// Cut-aware primitive assembler. -+struct PA_STATE_CUT : public PA_STATE -+{ -+ simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex -+ uint32_t numVerts; // number of vertices available in buffer store -+ uint32_t numAttribs; // number of attributes -+ int32_t numRemainingVerts; // number of verts remaining to be assembled -+ uint32_t numVertsToAssemble; // total number of verts to assemble for the draw -+ OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather -+ simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd -+ uint32_t numPrimsAssembled; // number of primitives that are fully assembled -+ uint32_t headVertex; // current unused vertex slot in vertex buffer store -+ uint32_t tailVertex; // beginning vertex currently assembling -+ uint32_t curVertex; // current unprocessed vertex -+ uint32_t startPrimId; // starting prim id -+ simdscalari vPrimId; // vector of prim ID -+ bool needOffsets; // need to compute gather offsets for current SIMD -+ uint32_t vertsPerPrim; -+ simdvertex tmpVertex; // temporary simdvertex for unimplemented API -+ bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they -+ // are ignored. Fetch shader sends invalid verts on cuts that should be ignored -+ // while the GS sends valid verts for every index -+ // Topology state tracking -+ uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; -+ uint32_t curIndex; -+ bool reverseWinding; // indicates reverse winding for strips -+ int32_t adjExtraVert; // extra vert uses for tristrip w/ adj -+ -+ typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); -+ PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert -+ -+ PA_STATE_CUT() {} -+ PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, -+ uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) -+ : PA_STATE(pDC, in_pStream, in_streamSizeInVerts) -+ { -+ numVerts = in_streamSizeInVerts; -+ numAttribs = in_numAttribs; -+ binTopology = topo; -+ needOffsets = false; -+ processCutVerts = in_processCutVerts; -+ -+ numVertsToAssemble = numRemainingVerts = in_numVerts; -+ numPrimsAssembled = 0; -+ headVertex = tailVertex = curVertex = 0; -+ -+ curIndex = 0; -+ pCutIndices = in_pIndices; -+ memset(indices, 0, sizeof(indices)); -+ vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ reverseWinding = false; -+ adjExtraVert = -1; -+ -+ bool gsEnabled = pDC->pState->state.gsState.gsEnable; -+ vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); -+ -+ switch (topo) -+ { -+ case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; -+ case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? 
&PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; -+ case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; -+ case TOP_TRI_STRIP_ADJ: if (gsEnabled) -+ { -+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ; -+ } -+ else -+ { -+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ; -+ } -+ break; -+ -+ case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; -+ case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; -+ case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; -+ case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; -+ case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; -+ default: assert(0 && "Unimplemented topology"); -+ } -+ } -+ -+ simdvertex& GetNextVsOutput() -+ { -+ uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; -+ this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; -+ this->needOffsets = true; -+ return ((simdvertex*)pStreamBase)[vertexIndex]; -+ } -+ -+ simdmask& GetNextVsIndices() -+ { -+ uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; -+ simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; -+ return *pCurCutIndex; -+ } -+ -+ simdvector& GetSimdVector(uint32_t index, uint32_t slot) -+ { -+ // unused -+ SWR_ASSERT(0 && "Not implemented"); -+ return this->tmpVertex.attrib[0]; -+ } -+ -+ bool GetNextStreamOutput() -+ { -+ this->headVertex += KNOB_SIMD_WIDTH; -+ this->needOffsets = true; -+ return HasWork(); -+ } -+ -+ simdscalari GetPrimID(uint32_t startID) -+ { -+ return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); -+ } -+ -+ void Reset() -+ { -+ this->numRemainingVerts = this->numVertsToAssemble; -+ this->numPrimsAssembled = 0; -+ this->curIndex = 0; -+ this->curVertex = 0; -+ this->tailVertex = 0; -+ this->headVertex = 0; -+ this->reverseWinding = false; -+ this->adjExtraVert = -1; -+ this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ } -+ -+ bool HasWork() -+ { -+ return this->numRemainingVerts > 0 || this->adjExtraVert != -1; -+ } -+ -+ bool IsVertexStoreFull() -+ { -+ return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; -+ } -+ -+ void RestartTopology() -+ { -+ this->curIndex = 0; -+ this->reverseWinding = false; -+ this->adjExtraVert = -1; -+ } -+ -+ bool IsCutIndex(uint32_t vertex) -+ { -+ uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; -+ uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); -+ return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; -+ } -+ -+ // iterates across the unprocessed verts until we hit the end or we -+ // have assembled SIMD prims -+ void ProcessVerts() -+ { -+ while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && -+ this->numRemainingVerts > 0 && -+ this->curVertex != this->headVertex) -+ { -+ // if cut index, restart topology -+ if (IsCutIndex(this->curVertex)) -+ { -+ if (this->processCutVerts) -+ { -+ (this->*pfnPa)(this->curVertex, false); -+ } -+ // finish off tri strip w/ adj before restarting topo -+ if (this->adjExtraVert != -1) -+ { -+ (this->*pfnPa)(this->curVertex, true); -+ } -+ RestartTopology(); -+ } -+ else -+ { -+ (this->*pfnPa)(this->curVertex, false); -+ } -+ -+ this->curVertex = (this->curVertex + 1) % this->numVerts; -+ this->numRemainingVerts--; -+ } -+ -+ // special case last primitive for tri strip w/ 
adj -+ if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) -+ { -+ (this->*pfnPa)(this->curVertex, true); -+ } -+ } -+ -+ void Advance() -+ { -+ // done with current batch -+ // advance tail to the current unsubmitted vertex -+ this->tailVertex = this->curVertex; -+ this->numPrimsAssembled = 0; -+ this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); -+ } -+ -+ bool NextPrim() -+ { -+ // if we've assembled enough prims, we can advance to the next set of verts -+ if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) -+ { -+ Advance(); -+ } -+ return false; -+ } -+ -+ void ComputeOffsets() -+ { -+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) -+ { -+ simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; -+ -+ // step to simdvertex batch -+ const uint32_t simdShift = 3; // @todo make knob -+ simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); -+ this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); -+ -+ // step to index -+ const uint32_t simdMask = 0x7; // @todo make knob -+ simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); -+ this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); -+ } -+ } -+ -+ bool Assemble(uint32_t slot, simdvector result[]) -+ { -+ // process any outstanding verts -+ ProcessVerts(); -+ -+ // return false if we don't have enough prims assembled -+ if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) -+ { -+ return false; -+ } -+ -+ // cache off gather offsets given the current SIMD set of indices the first time we get an assemble -+ if (this->needOffsets) -+ { -+ ComputeOffsets(); -+ this->needOffsets = false; -+ } -+ -+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) -+ { -+ simdscalari offsets = this->vOffsets[v]; -+ -+ // step to attribute -+ offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); -+ -+ float* pBase = (float*)this->pStreamBase; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); -+ -+ // move base to next component -+ pBase += KNOB_SIMD_WIDTH; -+ } -+ } -+ -+ return true; -+ } -+ -+ void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) -+ { -+ // move to slot -+ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) -+ { -+ uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; -+ uint32_t offset = pOffset[triIndex]; -+ offset += sizeof(simdvector) * slot; -+ float* pVert = (float*)&tri[v]; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ float* pComponent = (float*)(this->pStreamBase + offset); -+ pVert[c] = *pComponent; -+ offset += KNOB_SIMD_WIDTH * sizeof(float); -+ } -+ } -+ } -+ -+ uint32_t NumPrims() -+ { -+ return this->numPrimsAssembled; -+ } -+ -+ // Per-topology functions -+ void ProcessVertTriStrip(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 3) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ if (reverseWinding) -+ { -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[1]; -+ } -+ else -+ { -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ } -+ -+ // increment numPrimsAssembled -+ 
this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->vert[1] = this->vert[2]; -+ this->curIndex = 2; -+ this->reverseWinding ^= 1; -+ } -+ } -+ -+ template -+ void AssembleTriStripAdj() -+ { -+ if (!gsEnabled) -+ { -+ this->vert[1] = this->vert[2]; -+ this->vert[2] = this->vert[4]; -+ -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ -+ this->vert[4] = this->vert[2]; -+ this->vert[2] = this->vert[1]; -+ } -+ else -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ this->indices[4][this->numPrimsAssembled] = this->vert[4]; -+ this->indices[5][this->numPrimsAssembled] = this->vert[5]; -+ } -+ this->numPrimsAssembled++; -+ } -+ -+ -+ template -+ void ProcessVertTriStripAdj(uint32_t index, bool finish) -+ { -+ // handle last primitive of tristrip -+ if (finish && this->adjExtraVert != -1) -+ { -+ this->vert[3] = this->adjExtraVert; -+ AssembleTriStripAdj(); -+ this->adjExtraVert = -1; -+ return; -+ } -+ -+ switch (this->curIndex) -+ { -+ case 0: -+ case 1: -+ case 2: -+ case 4: -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ break; -+ case 3: -+ this->vert[5] = index; -+ this->curIndex++; -+ break; -+ case 5: -+ if (this->adjExtraVert == -1) -+ { -+ this->adjExtraVert = index; -+ } -+ else -+ { -+ this->vert[3] = index; -+ if (!gsEnabled) -+ { -+ AssembleTriStripAdj(); -+ -+ uint32_t nextTri[6]; -+ if (this->reverseWinding) -+ { -+ nextTri[0] = this->vert[4]; -+ nextTri[1] = this->vert[0]; -+ nextTri[2] = this->vert[2]; -+ nextTri[4] = this->vert[3]; -+ nextTri[5] = this->adjExtraVert; -+ } -+ else -+ { -+ nextTri[0] = this->vert[2]; -+ nextTri[1] = this->adjExtraVert; -+ nextTri[2] = this->vert[3]; -+ nextTri[4] = this->vert[4]; -+ nextTri[5] = this->vert[0]; -+ } -+ for (uint32_t i = 0; i < 6; ++i) -+ { -+ this->vert[i] = nextTri[i]; -+ } -+ -+ this->adjExtraVert = -1; -+ this->reverseWinding ^= 1; -+ } -+ else -+ { -+ this->curIndex++; -+ } -+ } -+ break; -+ case 6: -+ SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!"); -+ AssembleTriStripAdj(); -+ -+ uint32_t nextTri[6]; -+ if (this->reverseWinding) -+ { -+ nextTri[0] = this->vert[4]; -+ nextTri[1] = this->vert[0]; -+ nextTri[2] = this->vert[2]; -+ nextTri[4] = this->vert[3]; -+ nextTri[5] = this->adjExtraVert; -+ } -+ else -+ { -+ nextTri[0] = this->vert[2]; -+ nextTri[1] = this->adjExtraVert; -+ nextTri[2] = this->vert[3]; -+ nextTri[4] = this->vert[4]; -+ nextTri[5] = this->vert[0]; -+ } -+ for (uint32_t i = 0; i < 6; ++i) -+ { -+ this->vert[i] = nextTri[i]; -+ } -+ this->reverseWinding ^= 1; -+ this->adjExtraVert = index; -+ this->curIndex--; -+ break; -+ } -+ } -+ -+ void ProcessVertTriList(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 3) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertTriListAdj(uint32_t 
index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 6) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ this->indices[4][this->numPrimsAssembled] = this->vert[4]; -+ this->indices[5][this->numPrimsAssembled] = this->vert[5]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 6) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[4]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->curIndex = 0; -+ } -+ } -+ -+ -+ void ProcessVertLineList(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 2) -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertLineStrip(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 2) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->curIndex = 1; -+ } -+ } -+ -+ void ProcessVertLineStripAdj(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->vert[1] = this->vert[2]; -+ this->vert[2] = this->vert[3]; -+ this->curIndex = 3; -+ } -+ } -+ -+ void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ // assembled enough verts for prim, add to gather indices -+ this->indices[0][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ -+ // increment numPrimsAssembled -+ this->numPrimsAssembled++; -+ -+ // set up next prim state -+ this->vert[0] = this->vert[1]; -+ this->vert[1] = this->vert[2]; -+ this->vert[2] = this->vert[3]; -+ this->curIndex = 3; -+ } -+ } -+ -+ void ProcessVertLineListAdj(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ 
this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[2][this->numPrimsAssembled] = this->vert[2]; -+ this->indices[3][this->numPrimsAssembled] = this->vert[3]; -+ -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 4) -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[1]; -+ this->indices[1][this->numPrimsAssembled] = this->vert[2]; -+ -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+ -+ void ProcessVertPointList(uint32_t index, bool finish) -+ { -+ this->vert[this->curIndex] = index; -+ this->curIndex++; -+ if (this->curIndex == 1) -+ { -+ this->indices[0][this->numPrimsAssembled] = this->vert[0]; -+ this->numPrimsAssembled++; -+ this->curIndex = 0; -+ } -+ } -+}; -+ -+// Primitive Assembly for data output from the DomainShader. -+struct PA_TESS : PA_STATE -+{ -+ PA_TESS( -+ DRAW_CONTEXT *in_pDC, -+ const simdscalar* in_pVertData, -+ uint32_t in_attributeStrideInVectors, -+ uint32_t in_numAttributes, -+ uint32_t* (&in_ppIndices)[3], -+ uint32_t in_numPrims, -+ PRIMITIVE_TOPOLOGY in_binTopology) : -+ -+ PA_STATE(in_pDC, nullptr, 0), -+ m_pVertexData(in_pVertData), -+ m_attributeStrideInVectors(in_attributeStrideInVectors), -+ m_numAttributes(in_numAttributes), -+ m_numPrims(in_numPrims) -+ { -+ m_vPrimId = _simd_setzero_si(); -+ binTopology = in_binTopology; -+ m_ppIndices[0] = in_ppIndices[0]; -+ m_ppIndices[1] = in_ppIndices[1]; -+ m_ppIndices[2] = in_ppIndices[2]; -+ -+ switch (binTopology) -+ { -+ case TOP_POINT_LIST: -+ m_numVertsPerPrim = 1; -+ break; -+ -+ case TOP_LINE_LIST: -+ m_numVertsPerPrim = 2; -+ break; -+ -+ case TOP_TRIANGLE_LIST: -+ m_numVertsPerPrim = 3; -+ break; -+ -+ default: -+ SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); -+ break; -+ } -+ } -+ -+ bool HasWork() -+ { -+ return m_numPrims != 0; -+ } -+ -+ simdvector& GetSimdVector(uint32_t index, uint32_t slot) -+ { -+ SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); -+ static simdvector junk = { 0 }; -+ return junk; -+ } -+ -+ static simdscalari GenPrimMask(uint32_t numPrims) -+ { -+ SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); -+#if KNOB_SIMD_WIDTH == 8 -+ static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = -+ { -+ -1, -1, -1, -1, -1, -1, -1, -1, -+ 0, 0, 0, 0, 0, 0, 0, 0 -+ }; -+#elif KNOB_SIMD_WIDTH == 16 -+ static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = -+ { -+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -+ }; -+#else -+#error "Help, help, I can't get up!" 
-+#endif -+ -+ return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); -+ } -+ -+ bool Assemble(uint32_t slot, simdvector verts[]) -+ { -+ static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); -+ SWR_ASSERT(slot < m_numAttributes); -+ -+ uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); -+ if (0 == numPrimsToAssemble) -+ { -+ return false; -+ } -+ -+ simdscalari mask = GenPrimMask(numPrimsToAssemble); -+ -+ const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; -+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) -+ { -+ simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); -+ -+ const float* pBase = pBaseAttrib; -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ verts[i].v[c] = _simd_mask_i32gather_ps( -+ _simd_setzero_ps(), -+ pBase, -+ indices, -+ _simd_castsi_ps(mask), -+ 4 /* gcc doesn't like sizeof(float) */); -+ pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; -+ } -+ } -+ -+ return true; -+ } -+ -+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) -+ { -+ SWR_ASSERT(slot < m_numAttributes); -+ SWR_ASSERT(primIndex < PA_TESS::NumPrims()); -+ -+ const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; -+ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) -+ { -+ uint32_t index = m_ppIndices[i][primIndex]; -+ const float* pVertData = pVertDataBase; -+ float* pVert = (float*)&verts[i]; -+ -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ pVert[c] = pVertData[index]; -+ pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; -+ } -+ } -+ } -+ -+ bool NextPrim() -+ { -+ uint32_t numPrims = PA_TESS::NumPrims(); -+ m_numPrims -= numPrims; -+ m_ppIndices[0] += numPrims; -+ m_ppIndices[1] += numPrims; -+ m_ppIndices[2] += numPrims; -+ -+ return HasWork(); -+ } -+ -+ simdvertex& GetNextVsOutput() -+ { -+ SWR_ASSERT(0, "%s", __FUNCTION__); -+ static simdvertex junk; -+ return junk; -+ } -+ -+ bool GetNextStreamOutput() -+ { -+ SWR_ASSERT(0, "%s", __FUNCTION__); -+ return false; -+ } -+ -+ simdmask& GetNextVsIndices() -+ { -+ SWR_ASSERT(0, "%s", __FUNCTION__); -+ static simdmask junk; -+ return junk; -+ } -+ -+ uint32_t NumPrims() -+ { -+ return std::min(m_numPrims, KNOB_SIMD_WIDTH); -+ } -+ -+ void Reset() { SWR_ASSERT(0); }; -+ -+ simdscalari GetPrimID(uint32_t startID) -+ { -+ return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); -+ } -+ -+private: -+ const simdscalar* m_pVertexData = nullptr; -+ uint32_t m_attributeStrideInVectors = 0; -+ uint32_t m_numAttributes = 0; -+ uint32_t m_numPrims = 0; -+ uint32_t* m_ppIndices[3]; -+ -+ uint32_t m_numVertsPerPrim = 0; -+ -+ simdscalari m_vPrimId; -+}; -+ -+// Primitive Assembler factory class, responsible for creating and initializing the correct assembler -+// based on state. 
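The sliding-window trick in PA_TESS::GenPrimMask above deserves a scalar illustration: reading KNOB_SIMD_WIDTH consecutive entries starting at offset KNOB_SIMD_WIDTH - numPrims from a table of eight -1s followed by eight 0s yields exactly numPrims active lanes. A minimal scalar sketch, assuming an 8-wide SIMD; GenPrimMaskScalar is a hypothetical name used only here:

```cpp
#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of PA_TESS::GenPrimMask for an 8-wide SIMD:
// lanes [0, numPrims) become all-ones (-1), the rest become 0.
std::array<int32_t, 8> GenPrimMaskScalar(uint32_t numPrims)
{
    assert(numPrims <= 8);
    static const int32_t maskGen[16] = {
        -1, -1, -1, -1, -1, -1, -1, -1,
         0,  0,  0,  0,  0,  0,  0,  0
    };
    std::array<int32_t, 8> mask;
    // Sliding window: start further into the table as numPrims shrinks.
    for (uint32_t lane = 0; lane < 8; ++lane)
        mask[lane] = maskGen[(8 - numPrims) + lane];
    return mask;
}

int main()
{
    auto m = GenPrimMaskScalar(3);
    assert(m[0] == -1 && m[2] == -1 && m[3] == 0 && m[7] == 0);
    return 0;
}
```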
-+struct PA_FACTORY -+{ -+ PA_FACTORY(DRAW_CONTEXT* pDC, bool isIndexed, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) -+ { -+#if KNOB_ENABLE_CUT_AWARE_PA == TRUE -+ const API_STATE& state = GetApiState(pDC); -+ if ((isIndexed && ( -+ topo == TOP_TRIANGLE_STRIP || -+ (topo == TOP_POINT_LIST && CanUseSimplePoints(pDC)) || -+ topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || -+ topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ || -+ topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || -+ topo == TOP_TRI_STRIP_ADJ)) || -+ -+ // non-indexed draws with adjacency topologies must use cut-aware PA until we add support -+ // for them in the optimized PA -+ (!isIndexed && ( -+ topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))) -+ { -+ DWORD numAttribs; -+ _BitScanReverse(&numAttribs, state.feAttribMask); -+ numAttribs++; -+ this->paCut = PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, -+ &this->indexStore[0], numVerts, numAttribs, state.topology, false); -+ cutPA = true; -+ } -+ else -+#endif -+ { -+ uint32_t numPrims = GetNumPrims(in_topo, numVerts); -+ this->paOpt = PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); -+ cutPA = false; -+ } -+ -+ } -+ -+ PA_STATE& GetPA() -+ { -+#if KNOB_ENABLE_CUT_AWARE_PA == TRUE -+ if (cutPA) -+ { -+ return this->paCut; -+ } -+ else -+#endif -+ { -+ return this->paOpt; -+ } -+ } -+ -+ PA_STATE_OPT paOpt; -+ PA_STATE_CUT paCut; -+ bool cutPA; -+ -+ PRIMITIVE_TOPOLOGY topo; -+ -+ simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; -+ simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp -new file mode 100644 -index 0000000..6dce0bb ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp -@@ -0,0 +1,1330 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file pa_avx.cpp -+* -+* @brief AVX implementation for primitive assembly. -+* N primitives are assembled at a time, where N is the SIMD width. -+* A state machine, that is specific for a given topology, drives the -+* assembly of vertices into triangles. 
-+* -+******************************************************************************/ -+#include "context.h" -+#include "pa.h" -+#include "frontend.h" -+ -+#if (KNOB_SIMD_WIDTH == 8) -+ -+bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+ -+bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]); -+ -+bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]); -+ -+bool PaTriPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaTriPoints1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaTriPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+void PaTriPointsSingle1(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); -+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); -+ -+template -+void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output -+ // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. -+ // Each attribute has 4 components. 
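The gather below boils down to simple index arithmetic on the SoA vertex store. A scalar sketch of that arithmetic, assuming KNOB_SIMD_WIDTH == 8; PatchControlPointLocation is a hypothetical name:

```cpp
#include <cstdint>
#include <cstdio>

// Scalar model of the control-point gather in PaPatchListSingle, assuming
// KNOB_SIMD_WIDTH == 8. Control point 'cp' of patch 'primIndex' lives at
// lane (index % 8) of SIMD vector (index / 8) in the vertex store.
void PatchControlPointLocation(uint32_t primIndex, uint32_t totalControlPoints)
{
    for (uint32_t cp = 0; cp < totalControlPoints; ++cp)
    {
        uint32_t input_cp   = primIndex * totalControlPoints + cp;
        uint32_t input_vec  = input_cp / 8;   // which simdvector to read
        uint32_t input_lane = input_cp % 8;   // which lane inside it
        printf("patch %u, cp %u -> vector %u, lane %u\n",
               primIndex, cp, input_vec, input_lane);
    }
}

int main()
{
    // e.g. a 4-control-point patch list: patch 3 starts at vertex 12,
    // i.e. lane 4 of the second simdvector.
    PatchControlPointLocation(3, 4);
    return 0;
}
```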
-+ -+ /// @todo Optimize this -+ -+ float* pOutVec = (float*)verts; -+ -+ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) -+ { -+ uint32_t input_cp = primIndex * TotalControlPoints + cp; -+ uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; -+ uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; -+ -+ // Loop over all components of the attribute -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); -+ pOutVec[cp * 4 + i] = pInputVec[input_lane]; -+ } -+ } -+} -+ -+template -+static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState( -+ pa, -+ PaPatchList, -+ PaPatchListSingle); -+ -+ return false; -+} -+ -+template -+static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output -+ // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. -+ // Each attribute has 4 components. -+ -+ /// @todo Optimize this -+ -+ // Loop over all components of the attribute -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) -+ { -+ float vec[KNOB_SIMD_WIDTH]; -+ for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) -+ { -+ uint32_t input_cp = lane * TotalControlPoints + cp; -+ uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; -+ uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; -+ -+ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); -+ vec[lane] = pInputVec[input_lane]; -+ } -+ verts[cp][i] = _simd_loadu_ps(vec); -+ } -+ } -+ -+ SetNextPaState( -+ pa, -+ PaPatchList, -+ PaPatchListSingle, -+ 0, -+ KNOB_SIMD_WIDTH, -+ true); -+ -+ return true; -+} -+ -+#define PA_PATCH_LIST_TERMINATOR(N) \ -+ template<> bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ -+ { return PaPatchListTerm(pa, slot, verts); } -+PA_PATCH_LIST_TERMINATOR(1) -+PA_PATCH_LIST_TERMINATOR(2) -+PA_PATCH_LIST_TERMINATOR(3) -+PA_PATCH_LIST_TERMINATOR(4) -+PA_PATCH_LIST_TERMINATOR(5) -+PA_PATCH_LIST_TERMINATOR(6) -+PA_PATCH_LIST_TERMINATOR(7) -+PA_PATCH_LIST_TERMINATOR(8) -+PA_PATCH_LIST_TERMINATOR(9) -+PA_PATCH_LIST_TERMINATOR(10) -+PA_PATCH_LIST_TERMINATOR(11) -+PA_PATCH_LIST_TERMINATOR(12) -+PA_PATCH_LIST_TERMINATOR(13) -+PA_PATCH_LIST_TERMINATOR(14) -+PA_PATCH_LIST_TERMINATOR(15) -+PA_PATCH_LIST_TERMINATOR(16) -+PA_PATCH_LIST_TERMINATOR(17) -+PA_PATCH_LIST_TERMINATOR(18) -+PA_PATCH_LIST_TERMINATOR(19) -+PA_PATCH_LIST_TERMINATOR(20) -+PA_PATCH_LIST_TERMINATOR(21) -+PA_PATCH_LIST_TERMINATOR(22) -+PA_PATCH_LIST_TERMINATOR(23) -+PA_PATCH_LIST_TERMINATOR(24) -+PA_PATCH_LIST_TERMINATOR(25) -+PA_PATCH_LIST_TERMINATOR(26) -+PA_PATCH_LIST_TERMINATOR(27) -+PA_PATCH_LIST_TERMINATOR(28) -+PA_PATCH_LIST_TERMINATOR(29) -+PA_PATCH_LIST_TERMINATOR(30) -+PA_PATCH_LIST_TERMINATOR(31) -+PA_PATCH_LIST_TERMINATOR(32) -+#undef PA_PATCH_LIST_TERMINATOR -+ -+bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaTriList1, PaTriListSingle0); -+ return false; // Not enough vertices to assemble 4 or 8 triangles. -+} -+ -+bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaTriList2, PaTriListSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. 
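For reference, the pattern PaTriList2 and PaTriListSingle0 implement with blends, permutes and lane swizzles reduces to a trivial index mapping: triangle i reads vertices 3i, 3i+1 and 3i+2 out of the three buffered simdvectors. A scalar sketch (TriListSource is a hypothetical name):

```cpp
#include <cstdint>

// Scalar form of the TRIANGLE_LIST pattern used by PaTriList2/PaTriListSingle0:
// triangle i uses vertices 3i, 3i+1, 3i+2, which live in one of the three
// buffered simdvectors (a = verts 0-7, b = 8-15, c = 16-23).
void TriListSource(uint32_t prim, uint32_t whichVector[3], uint32_t lane[3])
{
    for (uint32_t v = 0; v < 3; ++v)
    {
        uint32_t vertex = prim * 3 + v;   // 0..23 for an 8-triangle batch
        whichVector[v]  = vertex / 8;     // 0 = a, 1 = b, 2 = c
        lane[v]         = vertex % 8;     // lane within that simdvector
    }
}
```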
-+} -+ -+bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ simdvector& c = PaGetSimdVector(pa, 2, slot); -+ simdscalar s; -+ -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 0 3 6 9 12 15 18 21 -+ // v1 -> 1 4 7 10 13 16 19 22 -+ // v2 -> 2 5 8 11 14 17 20 23 -+ -+ for(int i = 0; i < 4; ++i) -+ { -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_blend_ps(a[i], b[i], 0x92); -+ v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); -+ v0[i] = _mm256_permute_ps(v0[i], 0x6C); -+ s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21); -+ v0[i] = _simd_blend_ps(v0[i], s, 0x44); -+ -+ simdvector& v1 = verts[1]; -+ v1[i] = _simd_blend_ps(a[i], b[i], 0x24); -+ v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); -+ v1[i] = _mm256_permute_ps(v1[i], 0xB1); -+ s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21); -+ v1[i] = _simd_blend_ps(v1[i], s, 0x66); -+ -+ simdvector& v2 = verts[2]; -+ v2[i] = _simd_blend_ps(a[i], b[i], 0x49); -+ v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); -+ v2[i] = _mm256_permute_ps(v2[i], 0xC6); -+ s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21); -+ v2[i] = _simd_blend_ps(v2[i], s, 0x22); -+ } -+ -+ SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ // We have 12 simdscalars contained within 3 simdvectors which -+ // hold at least 8 triangles worth of data. We want to assemble a single -+ // triangle with data in horizontal form. -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ simdvector& c = PaGetSimdVector(pa, 2, slot); -+ -+ // Convert from vertical to horizontal. -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 0 3 6 9 12 15 18 21 -+ // v1 -> 1 4 7 10 13 16 19 22 -+ // v2 -> 2 5 8 11 14 17 20 23 -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane3(a); -+ verts[1] = swizzleLane4(a); -+ verts[2] = swizzleLane5(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ verts[2] = swizzleLane0(b); -+ break; -+ case 3: -+ verts[0] = swizzleLane1(b); -+ verts[1] = swizzleLane2(b); -+ verts[2] = swizzleLane3(b); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane5(b); -+ verts[2] = swizzleLane6(b); -+ break; -+ case 5: -+ verts[0] = swizzleLane7(b); -+ verts[1] = swizzleLane0(c); -+ verts[2] = swizzleLane1(c); -+ break; -+ case 6: -+ verts[0] = swizzleLane2(c); -+ verts[1] = swizzleLane3(c); -+ verts[2] = swizzleLane4(c); -+ break; -+ case 7: -+ verts[0] = swizzleLane5(c); -+ verts[1] = swizzleLane6(c); -+ verts[2] = swizzleLane7(c); -+ break; -+ }; -+} -+ -+bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. 
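The strip pattern documented below (v0 → 01234567, v1 → 13355779, v2 → 22446688) is the usual alternating-winding rule; in scalar form (TriStripIndices is a hypothetical name):

```cpp
#include <cstdint>

// Scalar form of the TRIANGLE_STRIP pattern used by PaTriStrip1/PaTriStripSingle0:
// triangle i uses vertices (i, i+1, i+2), with the last two swapped on odd
// triangles so that every triangle keeps the same facing.
void TriStripIndices(uint32_t prim, uint32_t idx[3])
{
    idx[0] = prim;                     // provoking vertex: 0 1 2 3 4 5 6 7
    idx[1] = prim + 1 + (prim & 1);    // 1 3 3 5 5 7 7 9
    idx[2] = prim + 2 - (prim & 1);    // 2 2 4 4 6 6 8 8
}
```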
-+} -+ -+bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ simdscalar s; -+ -+ for(int i = 0; i < 4; ++i) -+ { -+ simdscalar a0 = a[i]; -+ simdscalar b0 = b[i]; -+ -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 01234567 -+ // v1 -> 13355779 -+ // v2 -> 22446688 -+ simdvector& v0 = verts[0]; -+ v0[i] = a0; -+ -+ // s -> 4567891011 -+ s = _mm256_permute2f128_ps(a0, b0, 0x21); -+ // s -> 23456789 -+ s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); -+ -+ simdvector& v1 = verts[1]; -+ // v1 -> 13355779 -+ v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1)); -+ -+ simdvector& v2 = verts[2]; -+ // v2 -> 22446688 -+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); -+ } -+ -+ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ // Convert from vertical to horizontal. -+ // Tri Pattern - provoking vertex is always v0 -+ // v0 -> 01234567 -+ // v1 -> 13355779 -+ // v2 -> 22446688 -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane1(a); -+ verts[1] = swizzleLane3(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane2(a); -+ verts[1] = swizzleLane3(a); -+ verts[2] = swizzleLane4(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane3(a); -+ verts[1] = swizzleLane5(a); -+ verts[2] = swizzleLane4(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ verts[2] = swizzleLane6(a); -+ break; -+ case 5: -+ verts[0] = swizzleLane5(a); -+ verts[1] = swizzleLane7(a); -+ verts[2] = swizzleLane6(a); -+ break; -+ case 6: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ verts[2] = swizzleLane0(b); -+ break; -+ case 7: -+ verts[0] = swizzleLane7(a); -+ verts[1] = swizzleLane1(b); -+ verts[2] = swizzleLane0(b); -+ break; -+ }; -+} -+ -+bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ // Extract vertex 0 to every lane of first vector -+ for(int i = 0; i < 4; ++i) -+ { -+ __m256 a0 = a[i]; -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0)); -+ v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00); -+ } -+ -+ // store off leading vertex for attributes -+ simdvertex* pVertex = (simdvertex*)pa.pStreamBase; -+ pa.leadingVertex = pVertex[pa.cur]; -+ -+ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. -+} -+ -+bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& leadVert = pa.leadingVertex.attrib[slot]; -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ simdscalar s; -+ -+ // need to fill vectors 1/2 with new verts, and v0 with anchor vert. 
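In scalar terms the fan mapping implemented here and in PaTriFanSingle0 is: every triangle shares the anchor vertex captured by PaTriFan0, and the other two vertices advance by one each triangle. A sketch, with indices taken relative to the start of the fan (TriFanIndices is a hypothetical name):

```cpp
#include <cstdint>

// Scalar form of the TRIANGLE_FAN pattern: every triangle shares the leading
// (anchor) vertex stored in pa.leadingVertex; the remaining two vertices
// advance along the fan, one new vertex per triangle.
void TriFanIndices(uint32_t prim, uint32_t idx[3])
{
    idx[0] = 0;         // anchor vertex (pa.leadingVertex)
    idx[1] = prim + 1;
    idx[2] = prim + 2;
}
```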
-+ for(int i = 0; i < 4; ++i) -+ { -+ simdscalar a0 = a[i]; -+ simdscalar b0 = b[i]; -+ -+ __m256 comp = leadVert[i]; -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0)); -+ v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00); -+ -+ simdvector& v2 = verts[2]; -+ s = _mm256_permute2f128_ps(a0, b0, 0x21); -+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); -+ -+ simdvector& v1 = verts[1]; -+ v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); -+ } -+ -+ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ // vert 0 from leading vertex -+ simdvector& lead = pa.leadingVertex.attrib[slot]; -+ verts[0] = swizzleLane0(lead); -+ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ // vert 1 -+ if (primIndex < 7) -+ { -+ verts[1] = swizzleLaneN(a, primIndex + 1); -+ } -+ else -+ { -+ verts[1] = swizzleLane0(b); -+ } -+ -+ // vert 2 -+ if (primIndex < 6) -+ { -+ verts[2] = swizzleLaneN(a, primIndex + 2); -+ } -+ else -+ { -+ verts[2] = swizzleLaneN(b, primIndex - 6); -+ } -+} -+ -+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaQuadList1, PaQuadListSingle0); -+ return false; // Not enough vertices to assemble 8 triangles. -+} -+ -+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ simdscalar s1, s2; -+ -+ for(int i = 0; i < 4; ++i) -+ { -+ simdscalar a0 = a[i]; -+ simdscalar b0 = b[i]; -+ -+ s1 = _mm256_permute2f128_ps(a0, b0, 0x20); -+ s2 = _mm256_permute2f128_ps(a0, b0, 0x31); -+ -+ simdvector& v0 = verts[0]; -+ v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0)); -+ -+ simdvector& v1 = verts[1]; -+ v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1)); -+ -+ simdvector& v2 = verts[2]; -+ v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); -+ } -+ -+ SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ -+ switch (primIndex) -+ { -+ case 0: -+ // triangle 0 - 0 1 2 -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ -+ case 1: -+ // triangle 1 - 0 2 3 -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane2(a); -+ verts[2] = swizzleLane3(a); -+ break; -+ -+ case 2: -+ // triangle 2 - 4 5 6 -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ verts[2] = swizzleLane6(a); -+ break; -+ -+ case 3: -+ // triangle 3 - 4 6 7 -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane6(a); -+ verts[2] = swizzleLane7(a); -+ break; -+ -+ case 4: -+ // triangle 4 - 8 9 10 (0 1 2) -+ verts[0] = swizzleLane0(b); -+ verts[1] = swizzleLane1(b); -+ verts[2] = swizzleLane2(b); -+ break; -+ -+ case 5: -+ // triangle 1 - 0 2 3 -+ verts[0] = swizzleLane0(b); -+ verts[1] = swizzleLane2(b); -+ verts[2] = swizzleLane3(b); -+ break; -+ -+ case 6: -+ // triangle 2 - 4 5 6 -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane5(b); -+ verts[2] = swizzleLane6(b); -+ break; -+ -+ case 7: -+ // triangle 3 - 4 6 7 -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane6(b); -+ 
verts[2] = swizzleLane7(b); -+ break; -+ } -+} -+ -+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[]) -+{ -+ PaLineStripSingle0(pa, slot, lineIndex, verts); -+ -+ if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) { -+ simdvector &start = PaGetSimdVector(pa, pa.first, slot); -+ verts[1] = swizzleLane0(start); -+ } -+} -+ -+bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0); -+ return false; -+} -+ -+bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ PaLineStrip1(pa, slot, verts); -+ -+ if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) { -+ // loop reconnect now -+ int lane = pa.numPrims - pa.numPrimsComplete - 1; -+ simdvector &start = PaGetSimdVector(pa, pa.first, slot); -+ for (int i = 0; i < 4; i++) { -+ float *startVtx = (float *)&(start[i]); -+ float *targetVtx = (float *)&(verts[1][i]); -+ targetVtx[lane] = startVtx[0]; -+ } -+ } -+ -+ SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+ -+bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaLineList1, PaLineListSingle0); -+ return false; // Not enough vertices to assemble 8 lines -+} -+ -+bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ simdvector& b = PaGetSimdVector(pa, 1, slot); -+ /// @todo: verify provoking vertex is correct -+ // Line list 0 1 2 3 4 5 6 7 -+ // 8 9 10 11 12 13 14 15 -+ -+ // shuffle: -+ // 0 2 4 6 8 10 12 14 -+ // 1 3 5 7 9 11 13 15 -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ // 0 1 2 3 8 9 10 11 -+ __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20); -+ // 4 5 6 7 12 13 14 15 -+ __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31); -+ -+ // 0 2 4 6 8 10 12 14 -+ verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0)); -+ // 1 3 5 7 9 11 13 15 -+ verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1)); -+ } -+ -+ SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector &a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector &b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch (primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane2(a); -+ verts[1] = swizzleLane3(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane0(b); -+ verts[1] = swizzleLane1(b); -+ break; -+ case 5: -+ verts[0] = swizzleLane2(b); -+ verts[1] = swizzleLane3(b); -+ break; -+ case 6: -+ verts[0] = swizzleLane4(b); -+ verts[1] = swizzleLane5(b); -+ break; -+ case 7: -+ verts[0] = swizzleLane6(b); -+ verts[1] = swizzleLane7(b); -+ break; -+ } -+} -+ -+bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); -+ return false; // Not enough vertices to assemble 8 lines -+} -+ -+bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ /// @todo: 
verify provoking vertex is correct -+ // Line list 0 1 2 3 4 5 6 7 -+ // 8 9 10 11 12 13 14 15 -+ -+ // shuffle: -+ // 0 1 2 3 4 5 6 7 -+ // 1 2 3 4 5 6 7 8 -+ -+ verts[0] = a; -+ -+ for(uint32_t i = 0; i < 4; ++i) -+ { -+ // 1 2 3 x 5 6 7 x -+ __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1) -+ // 4 5 6 7 8 9 10 11 -+ __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21); -+ -+ // x x x 4 x x x 8 -+ __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0) -+ -+ verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88); -+ } -+ -+ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); -+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch (lineIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane1(a); -+ verts[1] = swizzleLane2(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane2(a); -+ verts[1] = swizzleLane3(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane3(a); -+ verts[1] = swizzleLane4(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(a); -+ verts[1] = swizzleLane5(a); -+ break; -+ case 5: -+ verts[0] = swizzleLane5(a); -+ verts[1] = swizzleLane6(a); -+ break; -+ case 6: -+ verts[0] = swizzleLane6(a); -+ verts[1] = swizzleLane7(a); -+ break; -+ case 7: -+ verts[0] = swizzleLane7(a); -+ verts[1] = swizzleLane0(b); -+ break; -+ } -+} -+ -+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ verts[0] = a; // points only have 1 vertex. 
-+ -+ SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ simdvector &a = PaGetSimdVector(pa, pa.cur, slot); -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane1(a); -+ break; -+ case 2: -+ verts[0] = swizzleLane2(a); -+ break; -+ case 3: -+ verts[0] = swizzleLane3(a); -+ break; -+ case 4: -+ verts[0] = swizzleLane4(a); -+ break; -+ case 5: -+ verts[0] = swizzleLane5(a); -+ break; -+ case 6: -+ verts[0] = swizzleLane6(a); -+ break; -+ case 7: -+ verts[0] = swizzleLane7(a); -+ break; -+ } -+} -+ -+// each point generates two tris -+// primitive assembly broadcasts each point to the 3 vertices of the 2 tris -+// binner will bloat each point -+// -+// input simd : p0 p1 p2 p3 p4 p5 p6 p7 == 8 points, 16 tris -+// output phase 0: -+// verts[0] : p0 p0 p1 p1 p2 p2 p3 p3 -+// verts[1] : p0 p0 p1 p1 p2 p2 p3 p3 -+// verts[2] : p0 p0 p1 p1 p2 p2 p3 p3 -+// -+// output phase 1: -+// verts[0] : p4 p4 p5 p5 p6 p6 p7 p7 -+// verts[1] : p4 p4 p5 p5 p6 p6 p7 p7 -+// verts[2] : p4 p4 p5 p5 p6 p6 p7 p7 -+ -+ -+// 0 1 2 3 4 5 6 7 -+ -+bool PaTriPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]); // 0 0 1 1 4 4 5 5 -+ __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]); // 2 2 3 3 6 6 7 7 -+ __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x20); // 0 0 1 1 2 2 3 3 -+ -+ verts[0].v[i] = verts[1].v[i] = verts[2].v[i] = vCombined; -+ } -+ -+ SetNextPaState(pa, PaTriPoints1, PaTriPointsSingle0, 1, KNOB_SIMD_WIDTH); -+ return true; -+} -+ -+bool PaTriPoints1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]); // 0 0 1 1 4 4 5 5 -+ __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]); // 2 2 3 3 6 6 7 7 -+ __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x31); // 4 4 5 5 6 6 7 7 -+ -+ verts[0].v[i] = verts[1].v[i] = verts[2].v[i] = vCombined; -+ } -+ -+ SetNextPaState(pa, PaTriPoints0, PaTriPointsSingle1, 0, KNOB_SIMD_WIDTH); -+ return true; -+ -+} -+ -+static void PaTriPointsSprite(PA_STATE_OPT& pa, uint32_t primIndex, __m128 verts[]) -+{ -+ const API_STATE& state = GetApiState(pa.pDC); -+ -+ if (!state.rastState.pointSpriteTopOrigin) { -+ if (primIndex & 1) { -+ verts[0] = _mm_set_ps(1, 0, 1, 0); -+ verts[1] = _mm_set_ps(1, 0, 0, 1); -+ verts[2] = _mm_set_ps(1, 0, 1, 1); -+ } else { -+ verts[0] = _mm_set_ps(1, 0, 1, 0); -+ verts[1] = _mm_set_ps(1, 0, 0, 0); -+ verts[2] = _mm_set_ps(1, 0, 0, 1); -+ } -+ } else { -+ if (primIndex & 1) { -+ verts[0] = _mm_set_ps(1, 0, 0, 0); -+ verts[1] = _mm_set_ps(1, 0, 1, 1); -+ verts[2] = _mm_set_ps(1, 0, 0, 1); -+ } else { -+ verts[0] = _mm_set_ps(1, 0, 0, 0); -+ verts[1] = _mm_set_ps(1, 0, 1, 0); -+ verts[2] = _mm_set_ps(1, 0, 1, 1); -+ } -+ } -+} -+ -+void PaTriPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ const API_STATE& state = GetApiState(pa.pDC); -+ -+ if (state.rastState.pointSpriteEnable && state.rastState.pointSpriteFESlot == slot) { -+ return PaTriPointsSprite(pa, primIndex, verts); -+ } -+ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch(primIndex) -+ { 
-+ case 0: -+ case 1: -+ verts[0] = verts[1] = verts[2] = swizzleLane0(a); break; -+ case 2: -+ case 3: -+ verts[0] = verts[1] = verts[2] = swizzleLane1(a); break; -+ case 4: -+ case 5: -+ verts[0] = verts[1] = verts[2] = swizzleLane2(a); break; -+ case 6: -+ case 7: -+ verts[0] = verts[1] = verts[2] = swizzleLane3(a); break; -+ } -+} -+ -+void PaTriPointsSingle1(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) -+{ -+ const API_STATE& state = GetApiState(pa.pDC); -+ -+ if (state.rastState.pointSpriteEnable && state.rastState.pointSpriteFESlot == slot) { -+ return PaTriPointsSprite(pa, primIndex, verts); -+ } -+ -+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); -+ -+ switch(primIndex) -+ { -+ case 0: -+ case 1: -+ verts[0] = verts[1] = verts[2] = swizzleLane4(a); break; -+ case 2: -+ case 3: -+ verts[0] = verts[1] = verts[2] = swizzleLane5(a); break; -+ case 4: -+ case 5: -+ verts[0] = verts[1] = verts[2] = swizzleLane6(a); break; -+ case 6: -+ case 7: -+ verts[0] = verts[1] = verts[2] = swizzleLane7(a); break; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief State 1 for RECT_LIST topology. -+/// There is not enough to assemble 8 triangles. -+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) -+{ -+ SetNextPaState(pa, PaRectList1, PaRectListSingle0); -+ return false; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief State 1 for RECT_LIST topology. -+/// Rect lists has the following format. -+/// w x y z -+/// v2 o---o v5 o---o v8 o---o v11 o---o -+/// | \ | | \ | | \ | | \ | -+/// v1 o---o v4 o---o v7 o---o v10 o---o -+/// v0 v3 v6 v9 -+/// -+/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. -+/// -+/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 -+/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 -+/// etc. -+/// -+/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 -+/// where v0 contains all the first vertices for 8 triangles. -+/// -+/// Result: -+/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } -+/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } -+/// verts[2] = { v2, w, v5, x, v8, y, v11, z } -+/// -+/// @param pa - State for PA state machine. -+/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -+bool PaRectList1( -+ PA_STATE_OPT& pa, -+ uint32_t slot, -+ simdvector verts[]) -+{ -+ // SIMD vectors a and b are the last two vertical outputs from the vertex shader. -+ simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } -+ simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } -+ -+ __m256 tmp0, tmp1, tmp2; -+ -+ // Loop over each component in the simdvector. -+ for(int i = 0; i < 4; ++i) -+ { -+ simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } -+ tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } -+ v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. 
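The only arithmetic in this routine is the implied fourth rectangle corner computed a few lines further down, w = v0 - v1 + v2, applied per component. In scalar form (Vec4 and ImpliedRectCorner are hypothetical names):

```cpp
// Scalar form of the implied rectangle corner used by PaRectList1:
// given the three supplied corners v0, v1, v2 of a rect, the missing
// corner is w = v0 - v1 + v2, computed per position/attribute component.
struct Vec4 { float x, y, z, w; };

Vec4 ImpliedRectCorner(const Vec4& v0, const Vec4& v1, const Vec4& v2)
{
    return { v0.x - v1.x + v2.x,
             v0.y - v1.y + v2.y,
             v0.z - v1.z + v2.z,
             v0.w - v1.w + v2.w };
}
```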
-+ tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } -+ v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } -+ v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } -+ -+ /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. -+ /// AVX2 should make this much cheaper. -+ simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } -+ v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } -+ tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } -+ tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } -+ tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } -+ v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } -+ v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } -+ v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } -+ -+ // verts[2] = { v2, w, v5, x, v8, y, v11, z } -+ simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } -+ v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } -+ tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } -+ v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); -+ -+ // Need to compute 4th implied vertex for the rectangle. -+ tmp2 = _mm256_sub_ps(v0[i], v1[i]); -+ tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } -+ tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } -+ v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } -+ } -+ -+ SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief State 2 for RECT_LIST topology. -+/// Not implemented unless there is a use case for more then 8 rects. -+/// @param pa - State for PA state machine. -+/// @param slot - Index into VS output which is either a position (slot 0) or attribute. -+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -+bool PaRectList2( -+ PA_STATE_OPT& pa, -+ uint32_t slot, -+ simdvector verts[]) -+{ -+ SWR_ASSERT(0); // Is rect list used for anything other then clears? -+ SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); -+ return true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief This procedure is called by the Binner to assemble the attributes. -+/// Unlike position, which is stored vertically, the attributes are -+/// stored horizontally. The outputs from the VS, labeled as 'a' and -+/// 'b' are vertical. This function needs to transpose the lanes -+/// containing the vertical attribute data into horizontal form. -+/// @param pa - State for PA state machine. -+/// @param slot - Index into VS output for a given attribute. -+/// @param primIndex - Binner processes each triangle individually. -+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. -+void PaRectListSingle0( -+ PA_STATE_OPT& pa, -+ uint32_t slot, -+ uint32_t primIndex, -+ __m128 verts[]) -+{ -+ // We have 12 simdscalars contained within 3 simdvectors which -+ // hold at least 8 triangles worth of data. 
We want to assemble a single -+ // triangle with data in horizontal form. -+ simdvector& a = PaGetSimdVector(pa, 0, slot); -+ -+ // Convert from vertical to horizontal. -+ switch(primIndex) -+ { -+ case 0: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane1(a); -+ verts[2] = swizzleLane2(a); -+ break; -+ case 1: -+ verts[0] = swizzleLane0(a); -+ verts[1] = swizzleLane2(a); -+ verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2); -+ break; -+ case 2: -+ case 3: -+ case 4: -+ case 5: -+ case 6: -+ case 7: -+ SWR_ASSERT(0); -+ break; -+ }; -+} -+ -+PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, -+ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), -+ cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ -+ this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; -+ -+ switch (this->binTopology) -+ { -+ case TOP_TRIANGLE_LIST: -+ this->pfnPaFunc = PaTriList0; -+ break; -+ case TOP_TRIANGLE_STRIP: -+ this->pfnPaFunc = PaTriStrip0; -+ break; -+ case TOP_TRIANGLE_FAN: -+ this->pfnPaFunc = PaTriFan0; -+ break; -+ case TOP_QUAD_LIST: -+ this->pfnPaFunc = PaQuadList0; -+ this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles -+ break; -+ case TOP_QUAD_STRIP: -+ // quad strip pattern when decomposed into triangles is the same as verts strips -+ this->pfnPaFunc = PaTriStrip0; -+ this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles -+ break; -+ case TOP_LINE_LIST: -+ this->pfnPaFunc = PaLineList0; -+ this->numPrims = in_numPrims; -+ break; -+ case TOP_LINE_STRIP: -+ this->pfnPaFunc = PaLineStrip0; -+ this->numPrims = in_numPrims; -+ break; -+ case TOP_LINE_LOOP: -+ this->pfnPaFunc = PaLineLoop0; -+ this->numPrims = in_numPrims; -+ break; -+ case TOP_POINT_LIST: -+ // use point binner and rasterizer if supported -+ if (CanUseSimplePoints(pDC)) -+ { -+ this->pfnPaFunc = PaPoints0; -+ this->numPrims = in_numPrims; -+ } -+ else -+ { -+ this->pfnPaFunc = PaTriPoints0; -+ this->numPrims = in_numPrims * 2; // 1 point generates 2 tris -+ } -+ break; -+ case TOP_RECT_LIST: -+ this->pfnPaFunc = PaRectList0; -+ this->numPrims = in_numPrims * 2; -+ break; -+ -+ case TOP_PATCHLIST_1: -+ this->pfnPaFunc = PaPatchList<1>; -+ break; -+ case TOP_PATCHLIST_2: -+ this->pfnPaFunc = PaPatchList<2>; -+ break; -+ case TOP_PATCHLIST_3: -+ this->pfnPaFunc = PaPatchList<3>; -+ break; -+ case TOP_PATCHLIST_4: -+ this->pfnPaFunc = PaPatchList<4>; -+ break; -+ case TOP_PATCHLIST_5: -+ this->pfnPaFunc = PaPatchList<5>; -+ break; -+ case TOP_PATCHLIST_6: -+ this->pfnPaFunc = PaPatchList<6>; -+ break; -+ case TOP_PATCHLIST_7: -+ this->pfnPaFunc = PaPatchList<7>; -+ break; -+ case TOP_PATCHLIST_8: -+ this->pfnPaFunc = PaPatchList<8>; -+ break; -+ case TOP_PATCHLIST_9: -+ this->pfnPaFunc = PaPatchList<9>; -+ break; -+ case TOP_PATCHLIST_10: -+ this->pfnPaFunc = PaPatchList<10>; -+ break; -+ case TOP_PATCHLIST_11: -+ this->pfnPaFunc = PaPatchList<11>; -+ break; -+ case TOP_PATCHLIST_12: -+ this->pfnPaFunc = PaPatchList<12>; -+ break; -+ case TOP_PATCHLIST_13: -+ this->pfnPaFunc = PaPatchList<13>; -+ break; -+ case TOP_PATCHLIST_14: -+ this->pfnPaFunc = PaPatchList<14>; -+ break; -+ case TOP_PATCHLIST_15: -+ this->pfnPaFunc = PaPatchList<15>; -+ break; -+ case TOP_PATCHLIST_16: -+ this->pfnPaFunc = 
PaPatchList<16>; -+ break; -+ case TOP_PATCHLIST_17: -+ this->pfnPaFunc = PaPatchList<17>; -+ break; -+ case TOP_PATCHLIST_18: -+ this->pfnPaFunc = PaPatchList<18>; -+ break; -+ case TOP_PATCHLIST_19: -+ this->pfnPaFunc = PaPatchList<19>; -+ break; -+ case TOP_PATCHLIST_20: -+ this->pfnPaFunc = PaPatchList<20>; -+ break; -+ case TOP_PATCHLIST_21: -+ this->pfnPaFunc = PaPatchList<21>; -+ break; -+ case TOP_PATCHLIST_22: -+ this->pfnPaFunc = PaPatchList<22>; -+ break; -+ case TOP_PATCHLIST_23: -+ this->pfnPaFunc = PaPatchList<23>; -+ break; -+ case TOP_PATCHLIST_24: -+ this->pfnPaFunc = PaPatchList<24>; -+ break; -+ case TOP_PATCHLIST_25: -+ this->pfnPaFunc = PaPatchList<25>; -+ break; -+ case TOP_PATCHLIST_26: -+ this->pfnPaFunc = PaPatchList<26>; -+ break; -+ case TOP_PATCHLIST_27: -+ this->pfnPaFunc = PaPatchList<27>; -+ break; -+ case TOP_PATCHLIST_28: -+ this->pfnPaFunc = PaPatchList<28>; -+ break; -+ case TOP_PATCHLIST_29: -+ this->pfnPaFunc = PaPatchList<29>; -+ break; -+ case TOP_PATCHLIST_30: -+ this->pfnPaFunc = PaPatchList<30>; -+ break; -+ case TOP_PATCHLIST_31: -+ this->pfnPaFunc = PaPatchList<31>; -+ break; -+ case TOP_PATCHLIST_32: -+ this->pfnPaFunc = PaPatchList<32>; -+ break; -+ -+ default: -+ SWR_ASSERT(0); -+ break; -+ }; -+ -+ // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); -+ // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3); -+ simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); -+ simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); -+ -+ switch(this->binTopology) -+ { -+ case TOP_TRIANGLE_LIST: -+ case TOP_TRIANGLE_STRIP: -+ case TOP_TRIANGLE_FAN: -+ case TOP_LINE_STRIP: -+ case TOP_LINE_LIST: -+ case TOP_LINE_LOOP: -+ this->primIDIncr = 8; -+ this->primID = id8; -+ break; -+ case TOP_QUAD_LIST: -+ case TOP_QUAD_STRIP: -+ case TOP_RECT_LIST: -+ this->primIDIncr = 4; -+ this->primID = id4; -+ break; -+ case TOP_POINT_LIST: -+ if (CanUseSimplePoints(pDC)) -+ { -+ this->primIDIncr = 8; -+ this->primID = id8; -+ } -+ else -+ { -+ this->primIDIncr = 4; -+ this->primID = id4; -+ } -+ break; -+ case TOP_PATCHLIST_1: -+ case TOP_PATCHLIST_2: -+ case TOP_PATCHLIST_3: -+ case TOP_PATCHLIST_4: -+ case TOP_PATCHLIST_5: -+ case TOP_PATCHLIST_6: -+ case TOP_PATCHLIST_7: -+ case TOP_PATCHLIST_8: -+ case TOP_PATCHLIST_9: -+ case TOP_PATCHLIST_10: -+ case TOP_PATCHLIST_11: -+ case TOP_PATCHLIST_12: -+ case TOP_PATCHLIST_13: -+ case TOP_PATCHLIST_14: -+ case TOP_PATCHLIST_15: -+ case TOP_PATCHLIST_16: -+ case TOP_PATCHLIST_17: -+ case TOP_PATCHLIST_18: -+ case TOP_PATCHLIST_19: -+ case TOP_PATCHLIST_20: -+ case TOP_PATCHLIST_21: -+ case TOP_PATCHLIST_22: -+ case TOP_PATCHLIST_23: -+ case TOP_PATCHLIST_24: -+ case TOP_PATCHLIST_25: -+ case TOP_PATCHLIST_26: -+ case TOP_PATCHLIST_27: -+ case TOP_PATCHLIST_28: -+ case TOP_PATCHLIST_29: -+ case TOP_PATCHLIST_30: -+ case TOP_PATCHLIST_31: -+ case TOP_PATCHLIST_32: -+ // Always run KNOB_SIMD_WIDTH number of patches at a time. -+ this->primIDIncr = 8; -+ this->primID = id8; -+ break; -+ -+ default: -+ SWR_ASSERT(0); -+ break; -+ }; -+ -+} -+#endif -diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp -new file mode 100644 -index 0000000..71de298 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp -@@ -0,0 +1,1217 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rasterizer.cpp -+* -+* @brief Implementation for the rasterizer. -+* -+******************************************************************************/ -+ -+#include -+#include -+ -+#include "rasterizer.h" -+#include "multisample.h" -+#include "rdtsc_core.h" -+#include "backend.h" -+#include "utils.h" -+#include "frontend.h" -+#include "tilemgr.h" -+#include "memory/tilingtraits.h" -+ -+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, -+ uint32_t numSamples, uint32_t renderTargetArrayIndex); -+void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep); -+void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, -+ uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep); -+ -+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} -+const __m128 gMaskToVec[] = { -+ MASKTOVEC(0,0,0,0), -+ MASKTOVEC(0,0,0,1), -+ MASKTOVEC(0,0,1,0), -+ MASKTOVEC(0,0,1,1), -+ MASKTOVEC(0,1,0,0), -+ MASKTOVEC(0,1,0,1), -+ MASKTOVEC(0,1,1,0), -+ MASKTOVEC(0,1,1,1), -+ MASKTOVEC(1,0,0,0), -+ MASKTOVEC(1,0,0,1), -+ MASKTOVEC(1,0,1,0), -+ MASKTOVEC(1,0,1,1), -+ MASKTOVEC(1,1,0,0), -+ MASKTOVEC(1,1,0,1), -+ MASKTOVEC(1,1,1,0), -+ MASKTOVEC(1,1,1,1), -+}; -+ -+const __m256d gMaskToVecpd[] = -+{ -+ MASKTOVEC(0, 0, 0, 0), -+ MASKTOVEC(0, 0, 0, 1), -+ MASKTOVEC(0, 0, 1, 0), -+ MASKTOVEC(0, 0, 1, 1), -+ MASKTOVEC(0, 1, 0, 0), -+ MASKTOVEC(0, 1, 0, 1), -+ MASKTOVEC(0, 1, 1, 0), -+ MASKTOVEC(0, 1, 1, 1), -+ MASKTOVEC(1, 0, 0, 0), -+ MASKTOVEC(1, 0, 0, 1), -+ MASKTOVEC(1, 0, 1, 0), -+ MASKTOVEC(1, 0, 1, 1), -+ MASKTOVEC(1, 1, 0, 0), -+ MASKTOVEC(1, 1, 0, 1), -+ MASKTOVEC(1, 1, 1, 0), -+ MASKTOVEC(1, 1, 1, 1), -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief rasterize a raster tile partially covered by the triangle -+/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile -+/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C) -+/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad. -+/// Used to step between quads when sweeping over the raster tile. 
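-+/// @return 64-bit coverage mask for the raster tile: each 2x2 pixel quad contributes
-+///         four bits, packed at the bit offsets used by the sweep below.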
-+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, __m256d vEdge0, __m256d vEdge1, __m256d vEdge2, -+ __m128i &vA, __m128i &vB, __m256d &vStepQuad0, __m256d &vStepQuad1, __m256d &vStepQuad2) -+{ -+ uint64_t coverageMask = 0; -+ -+ // Step to the pixel sample locations of the 1st quad -+ double edge0; -+ double edge1; -+ double edge2; -+ _mm_store_sd(&edge0, _mm256_castpd256_pd128(vEdge0)); -+ _mm_store_sd(&edge1, _mm256_castpd256_pd128(vEdge1)); -+ _mm_store_sd(&edge2, _mm256_castpd256_pd128(vEdge2)); -+ -+ vEdge0 = _mm256_broadcast_sd(&edge0); -+ vEdge1 = _mm256_broadcast_sd(&edge1); -+ vEdge2 = _mm256_broadcast_sd(&edge2); -+ -+ vEdge0 = _mm256_add_pd(vEdge0, vStepQuad0); -+ vEdge1 = _mm256_add_pd(vEdge1, vStepQuad1); -+ vEdge2 = _mm256_add_pd(vEdge2, vStepQuad2); -+ -+ // compute step to next quad (mul by 2 in x and y direction) -+ __m256d vAEdge0 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 0, 0, 0))); -+ __m256d vAEdge1 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(1, 1, 1, 1))); -+ __m256d vAEdge2 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(2, 2, 2, 2))); -+ __m256d vBEdge0 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 0, 0, 0))); -+ __m256d vBEdge1 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(1, 1, 1, 1))); -+ __m256d vBEdge2 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(2, 2, 2, 2))); -+ -+ __m256d vStep0X = _mm256_mul_pd(vAEdge0, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ __m256d vStep0Y = _mm256_mul_pd(vBEdge0, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ -+ __m256d vStep1X = _mm256_mul_pd(vAEdge1, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ __m256d vStep1Y = _mm256_mul_pd(vBEdge1, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ -+ __m256d vStep2X = _mm256_mul_pd(vAEdge2, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ __m256d vStep2Y = _mm256_mul_pd(vBEdge2, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); -+ -+ // fast unrolled version for 8x8 tile -+#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 -+ int mask0, mask1, mask2; -+ uint64_t mask; -+ -+ // evaluate which pixels in the quad are covered -+#define EVAL \ -+ mask0 = _mm256_movemask_pd(vEdge0);\ -+ mask1 = _mm256_movemask_pd(vEdge1);\ -+ mask2 = _mm256_movemask_pd(vEdge2); -+ -+ // update coverage mask -+#define UPDATE_MASK(bit) \ -+ mask = mask0 & mask1 & mask2;\ -+ coverageMask |= (mask << bit); -+ -+ // step in the +x direction to the next quad -+#define INCX \ -+ vEdge0 = _mm256_add_pd(vEdge0, vStep0X);\ -+ vEdge1 = _mm256_add_pd(vEdge1, vStep1X);\ -+ vEdge2 = _mm256_add_pd(vEdge2, vStep2X); -+ // step in the +y direction to the next quad -+#define INCY \ -+ vEdge0 = _mm256_add_pd(vEdge0, vStep0Y);\ -+ vEdge1 = _mm256_add_pd(vEdge1, vStep1Y);\ -+ vEdge2 = _mm256_add_pd(vEdge2, vStep2Y); -+ // step in the -x direction to the next quad -+#define DECX \ -+ vEdge0 = _mm256_sub_pd(vEdge0, vStep0X);\ -+ vEdge1 = _mm256_sub_pd(vEdge1, vStep1X);\ -+ vEdge2 = _mm256_sub_pd(vEdge2, vStep2X); -+ -+ // sweep 2x2 quad back and forth through the raster tile, -+ // computing coverage masks for the entire tile -+ -+ // raster tile -+ // 0 1 2 3 4 5 6 7 -+ // x x -+ // x x ------------------> -+ // x x | -+ // <-----------------x x V -+ // .. 
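-+    // In this unrolled 8x8 path the 2x2 quad is swept in a serpentine order, so each
-+    // move costs one add per edge equation (INCX / INCY / DECX). Every EVAL produces a
-+    // 4-bit quad mask (one bit per pixel), and UPDATE_MASK ORs it into the 64-bit
-+    // coverage mask at that quad's offset: row 0 fills bits 0,4,8,12; row 1 is walked
-+    // right-to-left and fills 28,24,20,16; rows 2 and 3 repeat the pattern.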
-+ -+ // row 0 -+ EVAL; -+ UPDATE_MASK(0); -+ INCX; -+ EVAL; -+ UPDATE_MASK(4); -+ INCX; -+ EVAL; -+ UPDATE_MASK(8); -+ INCX; -+ EVAL; -+ UPDATE_MASK(12); -+ INCY; -+ -+ //row 1 -+ EVAL; -+ UPDATE_MASK(28); -+ DECX; -+ EVAL; -+ UPDATE_MASK(24); -+ DECX; -+ EVAL; -+ UPDATE_MASK(20); -+ DECX; -+ EVAL; -+ UPDATE_MASK(16); -+ INCY; -+ -+ // row 2 -+ EVAL; -+ UPDATE_MASK(32); -+ INCX; -+ EVAL; -+ UPDATE_MASK(36); -+ INCX; -+ EVAL; -+ UPDATE_MASK(40); -+ INCX; -+ EVAL; -+ UPDATE_MASK(44); -+ INCY; -+ -+ // row 3 -+ EVAL; -+ UPDATE_MASK(60); -+ DECX; -+ EVAL; -+ UPDATE_MASK(56); -+ DECX; -+ EVAL; -+ UPDATE_MASK(52); -+ DECX; -+ EVAL; -+ UPDATE_MASK(48); -+#else -+ uint32_t bit = 0; -+ for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) -+ { -+ __m256d vStartOfRowEdge0 = vEdge0; -+ __m256d vStartOfRowEdge1 = vEdge1; -+ __m256d vStartOfRowEdge2 = vEdge2; -+ -+ for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) -+ { -+ int mask0 = _mm256_movemask_pd(vEdge0); -+ int mask1 = _mm256_movemask_pd(vEdge1); -+ int mask2 = _mm256_movemask_pd(vEdge2); -+ -+ uint64_t mask = mask0 & mask1 & mask2; -+ coverageMask |= (mask << bit); -+ -+ // step to the next pixel in the x -+ vEdge0 = _mm256_add_pd(vEdge0, vStep0X); -+ vEdge1 = _mm256_add_pd(vEdge1, vStep1X); -+ vEdge2 = _mm256_add_pd(vEdge2, vStep2X); -+ bit+=4; -+ } -+ -+ // step to the next row -+ vEdge0 = _mm256_add_pd(vStartOfRowEdge0, vStep0Y); -+ vEdge1 = _mm256_add_pd(vStartOfRowEdge1, vStep1Y); -+ vEdge2 = _mm256_add_pd(vStartOfRowEdge2, vStep2Y); -+ } -+#endif -+ return coverageMask; -+ -+} -+// Top left rule: -+// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge -+// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge -+// Top left: a sample is in if it is a top or left edge. 
-+// Out: !(horizontal && above) = !horizontal && below -+// Out: !horizontal && left = !(!horizontal && left) = horizontal and right -+INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge) -+{ -+ // if vA < 0, vC-- -+ // if vA == 0 && vB < 0, vC-- -+ -+ __m256d vEdgeOut = vEdge; -+ __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); -+ -+ // if vA < 0 (line is not horizontal and below) -+ int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); -+ -+ // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) -+ __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); -+ int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); -+ msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); -+ -+ // if either of these are true and we're on the line (edge == 0), bump it outside the line -+ vEdgeOut = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); -+ return vEdgeOut; -+} -+ -+// max(abs(dz/dx), abs(dz,dy) -+INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) -+{ -+ /* -+ // evaluate i,j at (0,0) -+ float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; -+ float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; -+ -+ // evaluate i,j at (1,0) -+ float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; -+ float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; -+ -+ // compute dz/dx -+ float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; -+ float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; -+ float dzdx = abs(d10 - d00); -+ -+ // evaluate i,j at (0,1) -+ float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; -+ float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; -+ -+ float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; -+ float dzdy = abs(d01 - d00); -+ */ -+ -+ // optimized version of above -+ float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); -+ float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); -+ -+ return std::max(dzdx, dzdy); -+} -+ -+INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) -+{ -+ if (pState->depthFormat == R24_UNORM_X8_TYPELESS) -+ { -+ return (1.0f / (1 << 24)); -+ } -+ else if (pState->depthFormat == R16_UNORM) -+ { -+ return (1.0f / (1 << 16)); -+ } -+ else -+ { -+ SWR_ASSERT(pState->depthFormat == R32_FLOAT); -+ -+ // for f32 depth, factor = 2^(exponent(max(abs(z) - 23) -+ float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); -+ uint32_t zMaxInt = *(uint32_t*)&zMax; -+ zMaxInt &= 0x7f800000; -+ zMax = *(float*)&zMaxInt; -+ -+ return zMax * (1.0f / (1 << 23)); -+ } -+} -+ -+INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) -+{ -+ if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) -+ { -+ return 0.0f; -+ } -+ -+ float scale = pState->slopeScaledDepthBias; -+ if (scale != 0.0f) -+ { -+ scale *= ComputeMaxDepthSlope(pTri); -+ } -+ -+ float bias = pState->depthBias * ComputeBiasFactor(pState, pTri, z) + scale; -+ if (pState->depthBiasClamp > 0.0f) -+ { -+ bias = std::min(bias, pState->depthBiasClamp); -+ } -+ else if (pState->depthBiasClamp < 0.0f) -+ { -+ bias = std::max(bias, pState->depthBiasClamp); -+ } -+ -+ return bias; -+} -+ -+// Prevent DCE by writing coverage mask from rasterizer to volatile -+#if KNOB_ENABLE_TOSS_POINTS -+__declspec(thread) 
volatile uint64_t gToss; -+#endif -+ -+static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; -+// try to avoid _chkstk insertions; make this thread local -+static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; -+ -+template -+void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) -+{ -+ -+ const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_BIN_TRIS) -+ { -+ return; -+ } -+#endif -+ RDTSC_START(BERasterizeTriangle); -+ -+ RDTSC_START(BETriangleSetup); -+ const API_STATE &state = GetApiState(pDC); -+ const SWR_RASTSTATE &rastState = state.rastState; -+ -+ OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; -+ triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; -+ -+ __m128 vX, vY, vZ, vRecipW; -+ -+ // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care -+ // eg: vX = [x0 x1 x2 dc] -+ vX = _mm_load_ps(workDesc.pTriBuffer); -+ vY = _mm_load_ps(workDesc.pTriBuffer + 4); -+ vZ = _mm_load_ps(workDesc.pTriBuffer + 8); -+ vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); -+ -+ // convert to fixed point -+ __m128i vXi = fpToFixedPoint(vX); -+ __m128i vYi = fpToFixedPoint(vY); -+ -+ // quantize floating point position to fixed point precision -+ // to prevent attribute creep around the triangle vertices -+ vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); -+ vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); -+ -+ // triangle setup - A and B edge equation coefs -+ __m128 vA, vB; -+ triangleSetupAB(vX, vY, vA, vB); -+ -+ __m128i vAi, vBi; -+ triangleSetupABInt(vXi, vYi, vAi, vBi); -+ -+ // determinant -+ float det = calcDeterminantInt(vAi, vBi); -+ -+ /// @todo: This test is flipped...we have a stray '-' sign somewhere -+ // Convert CW triangles to CCW -+ if (det > 0.0) -+ { -+ vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); -+ vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); -+ vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); -+ vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); -+ det = -det; -+ } -+ -+ __m128 vC; -+ // Finish triangle setup - C edge coef -+ triangleSetupC(vX, vY, vA, vB, vC); -+ -+ // compute barycentric i and j -+ // i = (A1x + B1y + C1)/det -+ // j = (A2x + B2y + C2)/det -+ __m128 vDet = _mm_set1_ps(det); -+ __m128 vRecipDet = _mm_div_ps(_mm_set1_ps(1.0f), vDet);//_mm_rcp_ps(vDet); -+ _mm_store_ss(&triDesc.recipDet, vRecipDet); -+ -+ // only extract coefs for 2 of the barycentrics; the 3rd can be -+ // determined from the barycentric equation: -+ // i + j + k = 1 <=> k = 1 - j - i -+ _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); -+ _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); -+ _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); -+ _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); -+ _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); -+ _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); -+ -+ OSALIGN(float, 16) oneOverW[4]; -+ _mm_store_ps(oneOverW, vRecipW); -+ triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; -+ triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; -+ triDesc.OneOverW[2] = oneOverW[2]; -+ -+ // calculate perspective correct coefs per vertex attrib -+ float* pPerspAttribs = perspAttribsTLS; -+ float* pAttribs = workDesc.pAttribs; -+ triDesc.pPerspAttribs = pPerspAttribs; -+ triDesc.pAttribs = pAttribs; -+ float *pRecipW = workDesc.pTriBuffer + 12; -+ __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); -+ __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1); -+ __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1); -+ 
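-+    // Scale each vertex attribute by that vertex's 1/w before storing it into the
-+    // thread-local perspAttribs buffer; this is the usual setup for perspective-correct
-+    // interpolation (the backend is then expected to divide the interpolated result by
-+    // the interpolated 1/w).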
for(uint32_t i = 0; i < workDesc.numAttribs; i++) -+ { -+ __m128 attribA = _mm_load_ps(pAttribs); -+ __m128 attribB = _mm_load_ps(pAttribs+=4); -+ __m128 attribC = _mm_load_ps(pAttribs+=4); -+ pAttribs+=4; -+ -+ attribA = _mm_mul_ps(attribA, vOneOverWV0); -+ attribB = _mm_mul_ps(attribB, vOneOverWV1); -+ attribC = _mm_mul_ps(attribC, vOneOverWV2); -+ -+ _mm_store_ps(pPerspAttribs, attribA); -+ _mm_store_ps(pPerspAttribs+=4, attribB); -+ _mm_store_ps(pPerspAttribs+=4, attribC); -+ pPerspAttribs+=4; -+ } -+ -+ // compute bary Z -+ // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) -+ OSALIGN(float, 16) a[4]; -+ _mm_store_ps(a, vZ); -+ triDesc.Z[0] = a[0] - a[2]; -+ triDesc.Z[1] = a[1] - a[2]; -+ triDesc.Z[2] = a[2]; -+ -+ // add depth bias -+ triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); -+ -+ // broadcast A and B coefs for each edge to all slots -+ __m128i vAEdge0h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(0,0,0,0)); -+ __m128i vAEdge1h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(1,1,1,1)); -+ __m128i vAEdge2h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(2,2,2,2)); -+ __m128i vBEdge0h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(0,0,0,0)); -+ __m128i vBEdge1h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(1,1,1,1)); -+ __m128i vBEdge2h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(2,2,2,2)); -+ -+ __m256d vAEdge0Fix8 = _mm256_cvtepi32_pd(vAEdge0h); -+ __m256d vAEdge1Fix8 = _mm256_cvtepi32_pd(vAEdge1h); -+ __m256d vAEdge2Fix8 = _mm256_cvtepi32_pd(vAEdge2h); -+ __m256d vBEdge0Fix8 = _mm256_cvtepi32_pd(vBEdge0h); -+ __m256d vBEdge1Fix8 = _mm256_cvtepi32_pd(vBEdge1h); -+ __m256d vBEdge2Fix8 = _mm256_cvtepi32_pd(vBEdge2h); -+ -+ // Precompute pixel quad step offsets -+ // 0,0 ------ 1,0 -+ // | | -+ // | | -+ // 1,0 ------ 1,1 -+ const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); -+ const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); -+ -+ // Evaluate edge equations at 4 upper left corners of a 2x2 pixel quad -+ // used to step between quads while sweeping over a raster tile -+ __m256d vQuadStepX0Fix16 = _mm256_mul_pd(vAEdge0Fix8, vQuadOffsetsXIntFix8); -+ __m256d vQuadStepX1Fix16 = _mm256_mul_pd(vAEdge1Fix8, vQuadOffsetsXIntFix8); -+ __m256d vQuadStepX2Fix16 = _mm256_mul_pd(vAEdge2Fix8, vQuadOffsetsXIntFix8); -+ -+ __m256d vQuadStepY0Fix16 = _mm256_mul_pd(vBEdge0Fix8, vQuadOffsetsYIntFix8); -+ __m256d vQuadStepY1Fix16 = _mm256_mul_pd(vBEdge1Fix8, vQuadOffsetsYIntFix8); -+ __m256d vQuadStepY2Fix16 = _mm256_mul_pd(vBEdge2Fix8, vQuadOffsetsYIntFix8); -+ -+ // vStepQuad = A*vQuadOffsetsXInt + B*vQuadOffsetsYInt -+ __m256d vStepQuad0Fix16 = _mm256_add_pd(vQuadStepX0Fix16, vQuadStepY0Fix16); -+ __m256d vStepQuad1Fix16 = _mm256_add_pd(vQuadStepX1Fix16, vQuadStepY1Fix16); -+ __m256d vStepQuad2Fix16 = _mm256_add_pd(vQuadStepX2Fix16, vQuadStepY2Fix16); -+ -+ // Precompute tile step offsets -+ // 0,0 ------ KNOB_TILE_X_DIM-1,0 -+ // | | -+ // | | -+ // KNOB_TILE_Y_DIM-1,0 ------ KNOB_TILE_X_DIM-1,KNOB_TILE_Y_DIM-1 -+ const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM-1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM-1)*FIXED_POINT_SCALE, 0); -+ const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM-1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM-1)*FIXED_POINT_SCALE, 0, 0); -+ -+ // Calc bounding box of triangle -+ OSALIGN(BBOX, 16) bbox; -+ calcBoundingBoxInt(vXi, vYi, bbox); -+ -+ // Intersect with scissor/viewport -+ bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left); -+ bbox.right = 
std::min(bbox.right - 1, state.scissorInFixedPoint.right); -+ bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top); -+ bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); -+ -+ triDesc.triFlags = workDesc.triFlags; -+ -+ // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox -+ uint32_t macroX, macroY; -+ MacroTileMgr::getTileIndices(macroTile, macroX, macroY); -+ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; -+ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; -+ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; -+ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; -+ -+ OSALIGN(BBOX, 16) intersect; -+ intersect.left = std::max(bbox.left, macroBoxLeft); -+ intersect.top = std::max(bbox.top, macroBoxTop); -+ intersect.right = std::min(bbox.right, macroBoxRight); -+ intersect.bottom = std::min(bbox.bottom, macroBoxBottom); -+ -+ SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0); -+ -+ RDTSC_STOP(BETriangleSetup, 0, pDC->drawId); -+ -+ // update triangle desc -+ uint32_t tileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t tileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); -+ uint32_t numTilesX = maxTileX - tileX + 1; -+ uint32_t numTilesY = maxTileY - tileY + 1; -+ -+ if (numTilesX == 0 || numTilesY == 0) -+ { -+ RDTSC_EVENT(BEEmptyTriangle, 1, 0); -+ RDTSC_STOP(BERasterizeTriangle, 1, 0); -+ return; -+ } -+ -+ RDTSC_START(BEStepSetup); -+ -+ // Step to pixel center of top-left pixel of the triangle bbox -+ // Align intersect bbox (top/left) to raster tile's (top/left). 
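-+    // (AlignDown(v, a) is assumed to round v down to the nearest multiple of a, so x and
-+    //  y land on the fixed-point corner of the containing raster tile.)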
-+ int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); -+ int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); -+ -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // Add 0.5, in fixed point, to offset to pixel center -+ x += (FIXED_POINT_SCALE / 2); -+ y += (FIXED_POINT_SCALE / 2); -+ } -+ -+ __m128i vTopLeftX = _mm_set1_epi32(x); -+ __m128i vTopLeftY = _mm_set1_epi32(y); -+ -+ // evaluate edge equations at top-left pixel using 64bit math -+ // all other evaluations will be 32bit steps from it -+ // small triangles could skip this and do all 32bit math -+ // edge 0 -+ // -+ // line = Ax + By + C -+ // solving for C: -+ // C = -Ax - By -+ // we know x0 and y0 are on the line; plug them in: -+ // C = -Ax0 - By0 -+ // plug C back into line equation: -+ // line = Ax - Bx - Ax0 - Bx1 -+ // line = A(x - x0) + B(y - y0) -+ // line = A(x0+dX) + B(y0+dY) + C = Ax0 + AdX + By0 + BdY + c = AdX + BdY -+ -+ // edge 0 and 1 -+ // edge0 = A0(x - x0) + B0(y - y0) -+ // edge1 = A1(x - x1) + B1(y - y1) -+ __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); -+ __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); -+ -+ __m256d vEdgeFix16[3]; -+ -+ // evaluate A(dx) and B(dY) for all points -+ __m256d vAipd = _mm256_cvtepi32_pd(vAi); -+ __m256d vBipd = _mm256_cvtepi32_pd(vBi); -+ __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); -+ __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); -+ -+ __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); -+ __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); -+ __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); -+ -+ // adjust for top-left rule -+ vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); -+ -+ // broadcast respective edge results to all lanes -+ double* pEdge = (double*)&vEdge; -+ vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); -+ vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); -+ vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); -+ -+ // compute step to the next tile -+ __m256d vNextXTileFix8 = _mm256_set1_pd(KNOB_TILE_X_DIM * FIXED_POINT_SCALE); -+ __m256d vNextYTileFix8 = _mm256_set1_pd(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE); -+ __m256d vTileStepX0Fix16 = _mm256_mul_pd(vAEdge0Fix8, vNextXTileFix8); -+ __m256d vTileStepY0Fix16 = _mm256_mul_pd(vBEdge0Fix8, vNextYTileFix8); -+ __m256d vTileStepX1Fix16 = _mm256_mul_pd(vAEdge1Fix8, vNextXTileFix8); -+ __m256d vTileStepY1Fix16 = _mm256_mul_pd(vBEdge1Fix8, vNextYTileFix8); -+ __m256d vTileStepX2Fix16 = _mm256_mul_pd(vAEdge2Fix8, vNextXTileFix8); -+ __m256d vTileStepY2Fix16 = _mm256_mul_pd(vBEdge2Fix8, vNextYTileFix8); -+ -+ // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile -+ // used to for testing if entire raster tile is inside a triangle -+ __m256d vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vTileOffsetsXIntFix8); -+ __m256d vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vTileOffsetsYIntFix8); -+ vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], _mm256_add_pd(vResultAxFix16, vResultByFix16)); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vTileOffsetsXIntFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vTileOffsetsYIntFix8); -+ vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], _mm256_add_pd(vResultAxFix16, vResultByFix16)); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vTileOffsetsXIntFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vTileOffsetsYIntFix8); -+ vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], _mm256_add_pd(vResultAxFix16, vResultByFix16)); -+ -+ // at this point vEdge has been evaluated at the UL pixel corners of raster 
tile bbox -+ // step sample positions to the raster tile bbox of multisample points -+ // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) -+ // | | -+ // | | -+ // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) -+ __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; -+ if(sampleCount > SWR_MULTISAMPLE_1X) -+ { -+ __m128i vTileSampleBBoxXh = MultisampleTraits::TileSampleOffsetsX(); -+ __m128i vTileSampleBBoxYh = MultisampleTraits::TileSampleOffsetsY(); -+ -+ __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); -+ __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); -+ -+ // step edge equation tests from Tile -+ // used to for testing if entire raster tile is inside a triangle -+ vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vTileSampleBBoxXFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vTileSampleBBoxYFix8); -+ vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vTileSampleBBoxXFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vTileSampleBBoxYFix8); -+ vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vTileSampleBBoxXFix8); -+ vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vTileSampleBBoxYFix8); -+ vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ } -+ -+ RDTSC_STOP(BEStepSetup, 0, pDC->drawId); -+ -+ uint32_t tY = tileY; -+ uint32_t tX = tileX; -+ uint32_t maxY = maxTileY; -+ uint32_t maxX = maxTileX; -+ -+ triDesc.pSamplePos = pDC->pState->state.samplePos; -+ -+ // compute steps between raster tiles for render output buffers -+ static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; -+ static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; -+ static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; -+ static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; -+ static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8)) * MultisampleTraits::numSamples}; -+ static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; -+ RenderOutputBuffers renderBuffers, currentRenderBufferRow; -+ -+ GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, MultisampleTraits::numSamples, -+ triDesc.triFlags.renderTargetArrayIndex); -+ currentRenderBufferRow = renderBuffers; -+ -+ // rasterize and generate coverage masks per sample -+ uint32_t maxSamples = MultisampleTraits::numSamples; -+ for (uint32_t tileY = tY; tileY <= maxY; ++tileY) -+ { -+ __m256d vStartOfRowEdge0 = vEdgeFix16[0]; -+ __m256d vStartOfRowEdge1 = vEdgeFix16[1]; -+ __m256d vStartOfRowEdge2 = vEdgeFix16[2]; -+ -+ for (uint32_t tileX = tX; tileX <= maxX; ++tileX) -+ { -+ uint64_t anyCoveredSamples = 0; -+ -+ // is the corner of the edge outside of the raster tile? (vEdge < 0) -+ int mask0, mask1, mask2; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // is the corner of the edge outside of the raster tile? 
(vEdge < 0) -+ mask0 = _mm256_movemask_pd(vEdgeFix16[0]); -+ mask1 = _mm256_movemask_pd(vEdgeFix16[1]); -+ mask2 = _mm256_movemask_pd(vEdgeFix16[2]); -+ } -+ else -+ { -+ __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; -+ // evaluate edge equations at the tile multisample bounding box -+ vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); -+ vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); -+ vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); -+ mask0 = _mm256_movemask_pd(vSampleBboxTest0); -+ mask1 = _mm256_movemask_pd(vSampleBboxTest1); -+ mask2 = _mm256_movemask_pd(vSampleBboxTest2); -+ } -+ -+ for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++) -+ { -+ // trivial reject, at least one edge has all 4 corners of raster tile outside -+ bool trivialReject = (!(mask0 && mask1 && mask2)) ? true : false; -+ -+ if (!trivialReject) -+ { -+ // trivial accept mask -+ triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; -+ if ((mask0 & mask1 & mask2) == 0xf) -+ { -+ anyCoveredSamples = triDesc.coverageMask[sampleNum]; -+ // trivial accept, all 4 corners of all 3 edges are negative -+ // i.e. raster tile completely inside triangle -+ RDTSC_EVENT(BETrivialAccept, 1, 0); -+ } -+ else -+ { -+ __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; -+ if(sampleCount == SWR_MULTISAMPLE_1X) -+ { -+ // should get optimized out for single sample case (global value numbering or copy propagation) -+ vEdge0AtSample = vEdgeFix16[0]; -+ vEdge1AtSample = vEdgeFix16[1]; -+ vEdge2AtSample = vEdgeFix16[2]; -+ } -+ else -+ { -+ __m128i vSampleOffsetXh = MultisampleTraits::vXi(sampleNum); -+ __m128i vSampleOffsetYh = MultisampleTraits::vYi(sampleNum); -+ __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); -+ __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); -+ -+ // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0] -+ // for each edge and broadcasts it before offsetting to individual pixel quads -+ -+ // step edge equation tests from UL tile corner to pixel sample position -+ vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vSampleOffsetX); -+ vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vSampleOffsetY); -+ vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vSampleOffsetX); -+ vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vSampleOffsetY); -+ vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); -+ -+ vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vSampleOffsetX); -+ vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vSampleOffsetY); -+ vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); -+ vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); -+ } -+ -+ // not trivial accept or reject, must rasterize full tile -+ RDTSC_START(BERasterizePartial); -+ triDesc.coverageMask[sampleNum] = rasterizePartialTile(pDC, vEdge0AtSample, vEdge1AtSample, vEdge2AtSample, -+ vAi, vBi, vStepQuad0Fix16, vStepQuad1Fix16, vStepQuad2Fix16); -+ RDTSC_STOP(BERasterizePartial, 0, 0); -+ -+ anyCoveredSamples |= triDesc.coverageMask[sampleNum]; -+ } -+ } -+ else -+ { -+ if(sampleCount > SWR_MULTISAMPLE_1X) -+ { -+ triDesc.coverageMask[sampleNum] = 0; -+ } -+ RDTSC_EVENT(BETrivialReject, 1, 0); -+ } -+ } -+ -+#if KNOB_ENABLE_TOSS_POINTS -+ if(KNOB_TOSS_RS) -+ { -+ gToss = triDesc.coverageMask[0]; -+ } -+ else 
-+#endif -+ if(anyCoveredSamples) -+ { -+ RDTSC_START(BEPixelBackend); -+ pDC->pState->pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); -+ RDTSC_STOP(BEPixelBackend, 0, 0); -+ } -+ -+ // step to the next tile in X -+ vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], vTileStepX0Fix16); -+ vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], vTileStepX1Fix16); -+ vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], vTileStepX2Fix16); -+ -+ StepRasterTileX(state.psState.maxRTSlotUsed, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep); -+ } -+ -+ // step to the next tile in Y -+ vEdgeFix16[0] = _mm256_add_pd(vStartOfRowEdge0, vTileStepY0Fix16); -+ vEdgeFix16[1] = _mm256_add_pd(vStartOfRowEdge1, vTileStepY1Fix16); -+ vEdgeFix16[2] = _mm256_add_pd(vStartOfRowEdge2, vTileStepY2Fix16); -+ -+ StepRasterTileY(state.psState.maxRTSlotUsed, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep); -+ } -+ -+ RDTSC_STOP(BERasterizeTriangle, 1, 0); -+} -+ -+void RasterizePoint(DRAW_CONTEXT *pDC, uint32_t workerId, const TRIANGLE_WORK_DESC &workDesc, uint32_t macroTile) -+{ -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_BIN_TRIS) -+ { -+ return; -+ } -+#endif -+ -+ // map x,y relative offsets from start of raster tile to bit position in -+ // coverage mask for the point -+ static const uint32_t coverageMap[8][8] = { -+ { 0, 1, 4, 5, 8, 9, 12, 13 }, -+ { 2, 3, 6, 7, 10, 11, 14, 15 }, -+ { 16, 17, 20, 21, 24, 25, 28, 29 }, -+ { 18, 19, 22, 23, 26, 27, 30, 31 }, -+ { 32, 33, 36, 37, 40, 41, 44, 45 }, -+ { 34, 35, 38, 39, 42, 43, 46, 47 }, -+ { 48, 49, 52, 53, 56, 57, 60, 61 }, -+ { 50, 51, 54, 55, 58, 59, 62, 63 } -+ }; -+ -+ OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; -+ -+ // pull point information from triangle buffer -+ // @todo use structs for readability -+ uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; -+ uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); -+ float z = *(workDesc.pTriBuffer + 2); -+ -+ // construct triangle descriptor for point -+ // no interpolation, set up i,j for constant interpolation of z and attribs -+ // @todo implement an optimized backend that doesn't require triangle information -+ -+ // compute coverage mask from x,y packed into the coverageMask flag -+ // mask indices by the maximum valid index for x/y of coveragemap. -+ uint32_t tX = workDesc.triFlags.coverageMask & 0x7; -+ uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; -+ // todo: multisample points? 
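-+    // The point's x/y offset within the raster tile arrives packed in
-+    // triFlags.coverageMask (x in bits 0..2, y in bits 4..6); coverageMap converts that
-+    // 2D offset into the swizzled bit index of the single covered sample.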
-+ triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; -+ -+ // no persp divide needed for points -+ triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; -+ triDesc.triFlags = workDesc.triFlags; -+ triDesc.recipDet = 1.0f; -+ triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; -+ triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; -+ triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; -+ triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; -+ -+ RenderOutputBuffers renderBuffers; -+ GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, -+ renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex); -+ -+ RDTSC_START(BEPixelBackend); -+ pDC->pState->pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); -+ RDTSC_STOP(BEPixelBackend, 0, 0); -+} -+ -+void rastPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ TRIANGLE_WORK_DESC *pDesc = (TRIANGLE_WORK_DESC*)pData; -+ RasterizePoint(pDC, workerId, *pDesc, macroTile); -+ -+} -+// Get pointers to hot tile memory for color RT, depth, stencil -+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, -+ uint32_t numSamples, uint32_t renderTargetArrayIndex) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ SWR_CONTEXT *pContext = pDC->pContext; -+ const SWR_DEPTH_STENCIL_STATE *pDSState = &state.depthStencilState; -+ const uint32_t MaxRT = state.psState.maxRTSlotUsed; -+ -+ uint32_t mx, my; -+ MacroTileMgr::getTileIndices(macroID, mx, my); -+ tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; -+ tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; -+ -+ if(state.psState.pfnPixelShader != NULL) -+ { -+ // compute tile offset for active hottile buffers -+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; -+ uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); -+ offset*=numSamples; -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, -+ numSamples, renderTargetArrayIndex); -+ pColor->state = HOTTILE_DIRTY; -+ renderBuffers.pColor[rt] = pColor->pBuffer + offset; -+ } -+ } -+ if(pDSState->depthTestEnable || pDSState->depthWriteEnable) -+ { -+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; -+ uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); -+ offset*=numSamples; -+ HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, -+ numSamples, renderTargetArrayIndex); -+ pDepth->state = HOTTILE_DIRTY; -+ SWR_ASSERT(pDepth->pBuffer != nullptr); -+ renderBuffers.pDepth = pDepth->pBuffer + offset; -+ } -+ if(pDSState->stencilTestEnable) -+ { -+ const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits::bpp / 8; -+ uint32_t offset = ComputeTileOffset2D::bpp> >(pitch, tileX, tileY); -+ offset*=numSamples; -+ HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, -+ numSamples, renderTargetArrayIndex); -+ pStencil->state = HOTTILE_DIRTY; -+ SWR_ASSERT(pStencil->pBuffer != nullptr); -+ renderBuffers.pStencil = pStencil->pBuffer + offset; -+ } -+} -+ -+INLINE -+void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep) -+{ -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ 
buffers.pColor[rt] += colorTileStep; -+ } -+ -+ buffers.pDepth += depthTileStep; -+ buffers.pStencil += stencilTileStep; -+} -+ -+INLINE -+void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep) -+{ -+ for(uint32_t rt = 0; rt <= MaxRT; ++rt) -+ { -+ startBufferRow.pColor[rt] += colorRowStep; -+ buffers.pColor[rt] = startBufferRow.pColor[rt]; -+ } -+ startBufferRow.pDepth += depthRowStep; -+ buffers.pDepth = startBufferRow.pDepth; -+ -+ startBufferRow.pStencil += stencilRowStep; -+ buffers.pStencil = startBufferRow.pStencil; -+} -+ -+// initialize rasterizer function table -+PFN_WORK_FUNC gRasterizerTable[SWR_MULTISAMPLE_TYPE_MAX] = -+{ -+ RasterizeTriangle, -+ RasterizeTriangle, -+ RasterizeTriangle, -+ RasterizeTriangle, -+ RasterizeTriangle -+}; -+ -+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) -+{ -+ const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); -+#if KNOB_ENABLE_TOSS_POINTS -+ if (KNOB_TOSS_BIN_TRIS) -+ { -+ return; -+ } -+#endif -+ -+ // bloat line to two tris and call the triangle rasterizer twice -+ RDTSC_START(BERasterizeLine); -+ -+ const API_STATE &state = GetApiState(pDC); -+ -+ // macrotile dimensioning -+ uint32_t macroX, macroY; -+ MacroTileMgr::getTileIndices(macroTile, macroX, macroY); -+ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; -+ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; -+ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; -+ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; -+ -+ // create a copy of the triangle buffer to write our adjusted vertices to -+ OSALIGNSIMD(float) newTriBuffer[4 * 4]; -+ TRIANGLE_WORK_DESC newWorkDesc = workDesc; -+ newWorkDesc.pTriBuffer = &newTriBuffer[0]; -+ -+ // create a copy of the attrib buffer to write our adjusted attribs to -+ OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; -+ newWorkDesc.pAttribs = &newAttribBuffer[0]; -+ -+ const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); -+ const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); -+ -+ __m128 vX, vY, vZ, vRecipW; -+ -+ vX = _mm_load_ps(workDesc.pTriBuffer); -+ vY = _mm_load_ps(workDesc.pTriBuffer + 4); -+ vZ = _mm_load_ps(workDesc.pTriBuffer + 8); -+ vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); -+ -+ // triangle 0 -+ // v0,v1 -> v0,v0,v1 -+ __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); -+ __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); -+ __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); -+ __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); -+ -+ __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); -+ __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); -+ if (workDesc.triFlags.yMajor) -+ { -+ vXa = _mm_add_ps(vAdjust, vXa); -+ } -+ else -+ { -+ vYa = _mm_add_ps(vAdjust, vYa); -+ } -+ -+ // Store triangle description for rasterizer -+ _mm_store_ps((float*)&newTriBuffer[0], vXa); -+ _mm_store_ps((float*)&newTriBuffer[4], vYa); -+ _mm_store_ps((float*)&newTriBuffer[8], vZa); -+ _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); -+ -+ // binner bins 3 edges for lines as v0, v1, v1 -+ // tri0 needs v0, v0, v1 -+ for (uint32_t a = 0; a < workDesc.numAttribs; ++a) -+ { -+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]); -+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]); -+ -+ 
_mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0); -+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0); -+ _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1); -+ } -+ -+ // Store user clip distances for triangle 0 -+ float newClipBuffer[3 * 8]; -+ uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); -+ if (numClipDist) -+ { -+ newWorkDesc.pUserClipBuffer = newClipBuffer; -+ -+ float* pOldBuffer = workDesc.pUserClipBuffer; -+ float* pNewBuffer = newClipBuffer; -+ for (uint32_t i = 0; i < numClipDist; ++i) -+ { -+ // read barycentric coeffs from binner -+ float a = *(pOldBuffer++); -+ float b = *(pOldBuffer++); -+ -+ // reconstruct original clip distance at vertices -+ float c0 = a + b; -+ float c1 = b; -+ -+ // construct triangle barycentrics -+ *(pNewBuffer++) = c0 - c1; -+ *(pNewBuffer++) = c0 - c1; -+ *(pNewBuffer++) = c1; -+ } -+ } -+ -+ // make sure this macrotile intersects the triangle -+ __m128i vXai = fpToFixedPoint(vXa); -+ __m128i vYai = fpToFixedPoint(vYa); -+ OSALIGN(BBOX, 16) bboxA; -+ calcBoundingBoxInt(vXai, vYai, bboxA); -+ -+ if (!(bboxA.left > macroBoxRight || -+ bboxA.left > state.scissorInFixedPoint.right || -+ bboxA.right - 1 < macroBoxLeft || -+ bboxA.right - 1 < state.scissorInFixedPoint.left || -+ bboxA.top > macroBoxBottom || -+ bboxA.top > state.scissorInFixedPoint.bottom || -+ bboxA.bottom - 1 < macroBoxTop || -+ bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { -+ // rasterize triangle -+ RasterizeTriangle(pDC, workerId, macroTile, (void*)&newWorkDesc); -+ } -+ -+ // triangle 1 -+ // v0,v1 -> v1,v1,v0 -+ vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); -+ vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); -+ vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); -+ vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); -+ -+ vAdjust = _mm_mul_ps(vLineWidth, vBloat1); -+ if (workDesc.triFlags.yMajor) -+ { -+ vXa = _mm_add_ps(vAdjust, vXa); -+ } -+ else -+ { -+ vYa = _mm_add_ps(vAdjust, vYa); -+ } -+ -+ // Store triangle description for rasterizer -+ _mm_store_ps((float*)&newTriBuffer[0], vXa); -+ _mm_store_ps((float*)&newTriBuffer[4], vYa); -+ _mm_store_ps((float*)&newTriBuffer[8], vZa); -+ _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); -+ -+ // binner bins 3 edges for lines as v0, v1, v1 -+ // tri1 needs v1, v1, v0 -+ for (uint32_t a = 0; a < workDesc.numAttribs; ++a) -+ { -+ __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); -+ __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); -+ -+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); -+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); -+ _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); -+ } -+ -+ // store user clip distance for triangle 1 -+ if (numClipDist) -+ { -+ float* pOldBuffer = workDesc.pUserClipBuffer; -+ float* pNewBuffer = newClipBuffer; -+ for (uint32_t i = 0; i < numClipDist; ++i) -+ { -+ // read barycentric coeffs from binner -+ float a = *(pOldBuffer++); -+ float b = *(pOldBuffer++); -+ -+ // reconstruct original clip distance at vertices -+ float c0 = a + b; -+ float c1 = b; -+ -+ // construct triangle barycentrics -+ *(pNewBuffer++) = c1 - c0; -+ *(pNewBuffer++) = c1 - c0; -+ *(pNewBuffer++) = c0; -+ } -+ } -+ -+ vXai = fpToFixedPoint(vXa); -+ vYai = fpToFixedPoint(vYa); -+ calcBoundingBoxInt(vXai, vYai, bboxA); -+ -+ if (!(bboxA.left > macroBoxRight || -+ bboxA.left > state.scissorInFixedPoint.right || -+ bboxA.right - 1 < macroBoxLeft || -+ bboxA.right - 
1 < state.scissorInFixedPoint.left || -+ bboxA.top > macroBoxBottom || -+ bboxA.top > state.scissorInFixedPoint.bottom || -+ bboxA.bottom - 1 < macroBoxTop || -+ bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { -+ // rasterize triangle -+ RasterizeTriangle(pDC, workerId, macroTile, (void*)&newWorkDesc); -+ } -+ -+ RDTSC_STOP(BERasterizeLine, 1, 0); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h -new file mode 100644 -index 0000000..e07d7ea ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h -@@ -0,0 +1,34 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file rasterizer.h -+* -+* @brief Definitions for the rasterizer. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "context.h" -+ -+void rastPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -+extern PFN_WORK_FUNC gRasterizerTable[SWR_MULTISAMPLE_TYPE_MAX]; -+void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp -new file mode 100644 -index 0000000..df96f72 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp -@@ -0,0 +1,90 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#include "rdtsc_core.h" -+#include "common/rdtsc_buckets.h" -+ -+// must match CORE_BUCKETS enum order -+BUCKET_DESC gCoreBuckets[] = { -+ { "APIClearRenderTarget", "", true, 0xff0b8bea }, -+ { "APIDraw", "", true, 0xff000066 }, -+ { "APIDrawWakeAllThreads", "", false, 0xffffffff }, -+ { "APIDrawIndexed", "", true, 0xff000066 }, -+ { "APIDispatch", "", true, 0xff660000 }, -+ { "APIStoreTiles", "", true, 0xff00ffff }, -+ { "APIGetDrawContext", "", false, 0xffffffff }, -+ { "APISync", "", true, 0xff6666ff }, -+ { "FEProcessDraw", "", true, 0xff009900 }, -+ { "FEProcessDrawIndexed", "", true, 0xff009900 }, -+ { "FEFetchShader", "", false, 0xffffffff }, -+ { "FEVertexShader", "", false, 0xffffffff }, -+ { "FEHullShader", "", false, 0xffffffff }, -+ { "FETessellation", "", false, 0xffffffff }, -+ { "FEDomainShader", "", false, 0xffffffff }, -+ { "FEGeometryShader", "", false, 0xffffffff }, -+ { "FEStreamout", "", false, 0xffffffff }, -+ { "FEPAAssemble", "", false, 0xffffffff }, -+ { "FEBinPoints", "", false, 0xff29b854 }, -+ { "FEBinLines", "", false, 0xff29b854 }, -+ { "FEBinTriangles", "", false, 0xff29b854 }, -+ { "FETriangleSetup", "", false, 0xffffffff }, -+ { "FEViewportCull", "", false, 0xffffffff }, -+ { "FEGuardbandClip", "", false, 0xffffffff }, -+ { "FEClipPoints", "", false, 0xffffffff }, -+ { "FEClipLines", "", false, 0xffffffff }, -+ { "FEClipTriangles", "", false, 0xffffffff }, -+ { "FECullZeroAreaAndBackface", "", false, 0xffffffff }, -+ { "FECullBetweenCenters", "", false, 0xffffffff }, -+ { "FEProcessStoreTiles", "", true, 0xff39c864 }, -+ { "FEProcessInvalidateTiles", "", true, 0xffffffff }, -+ { "WorkerWorkOnFifoBE", "", false, 0xff40261c }, -+ { "WorkerFoundWork", "", false, 0xff573326 }, -+ { "BELoadTiles", "", true, 0xffb0e2ff }, -+ { "BEDispatch", "", true, 0xff00a2ff }, -+ { "BEClear", "", true, 0xff00ccbb }, -+ { "BERasterizeLine", "", true, 0xffb26a4e }, -+ { "BERasterizeTriangle", "", true, 0xffb26a4e }, -+ { "BETriangleSetup", "", false, 0xffffffff }, -+ { "BEStepSetup", "", false, 0xffffffff }, -+ { "BECullZeroArea", "", false, 0xffffffff }, -+ { "BEEmptyTriangle", "", false, 0xffffffff }, -+ { "BETrivialAccept", "", false, 0xffffffff }, -+ { "BETrivialReject", "", false, 0xffffffff }, -+ { "BERasterizePartial", "", false, 0xffffffff }, -+ { "BEPixelBackend", "", false, 0xffffffff }, -+ { "BESetup", "", false, 0xffffffff }, -+ { "BEBarycentric", "", false, 0xffffffff }, -+ { "BEEarlyDepthTest", "", false, 0xffffffff }, -+ { "BEPixelShader", "", false, 0xffffffff }, -+ { "BELateDepthTest", "", false, 0xffffffff }, -+ { "BEOutputMerger", "", false, 0xffffffff }, -+ { "BEStoreTiles", "", true, 0xff00cccc }, -+ { "BEEndTile", "", false, 0xffffffff }, -+ { "WorkerWaitForThreadEvent", "", false, 0xffffffff }, -+}; -+ -+/// @todo bucketmanager and mapping should probably be a part of the SWR context -+std::vector gBucketMap; -+BucketManager gBucketMgr(false); -+ -+uint32_t gCurrentFrame = 0; -diff --git 
a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h -new file mode 100644 -index 0000000..1e3700d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h -@@ -0,0 +1,175 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#pragma once -+#include "knobs.h" -+ -+#include "common/os.h" -+#include "common/rdtsc_buckets.h" -+ -+#include -+ -+enum CORE_BUCKETS -+{ -+ APIClearRenderTarget, -+ APIDraw, -+ APIDrawWakeAllThreads, -+ APIDrawIndexed, -+ APIDispatch, -+ APIStoreTiles, -+ APIGetDrawContext, -+ APISync, -+ FEProcessDraw, -+ FEProcessDrawIndexed, -+ FEFetchShader, -+ FEVertexShader, -+ FEHullShader, -+ FETessellation, -+ FEDomainShader, -+ FEGeometryShader, -+ FEStreamout, -+ FEPAAssemble, -+ FEBinPoints, -+ FEBinLines, -+ FEBinTriangles, -+ FETriangleSetup, -+ FEViewportCull, -+ FEGuardbandClip, -+ FEClipPoints, -+ FEClipLines, -+ FEClipTriangles, -+ FECullZeroAreaAndBackface, -+ FECullBetweenCenters, -+ FEProcessStoreTiles, -+ FEProcessInvalidateTiles, -+ WorkerWorkOnFifoBE, -+ WorkerFoundWork, -+ BELoadTiles, -+ BEDispatch, -+ BEClear, -+ BERasterizeLine, -+ BERasterizeTriangle, -+ BETriangleSetup, -+ BEStepSetup, -+ BECullZeroArea, -+ BEEmptyTriangle, -+ BETrivialAccept, -+ BETrivialReject, -+ BERasterizePartial, -+ BEPixelBackend, -+ BESetup, -+ BEBarycentric, -+ BEEarlyDepthTest, -+ BEPixelShader, -+ BELateDepthTest, -+ BEOutputMerger, -+ BEStoreTiles, -+ BEEndTile, -+ WorkerWaitForThreadEvent, -+ -+ NumBuckets -+}; -+ -+void rdtscReset(); -+void rdtscInit(int threadId); -+void rdtscStart(uint32_t bucketId); -+void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId); -+void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2); -+void rdtscEndFrame(); -+ -+#ifdef KNOB_ENABLE_RDTSC -+#define RDTSC_RESET() rdtscReset() -+#define RDTSC_INIT(threadId) rdtscInit(threadId) -+#define RDTSC_START(bucket) rdtscStart(bucket) -+#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw) -+#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2) -+#define RDTSC_ENDFRAME() rdtscEndFrame() -+#else -+#define RDTSC_RESET() -+#define RDTSC_INIT(threadId) -+#define RDTSC_START(bucket) 
-+#define RDTSC_STOP(bucket, count, draw) -+#define RDTSC_EVENT(bucket, count1, count2) -+#define RDTSC_ENDFRAME() -+#endif -+ -+extern std::vector gBucketMap; -+extern BucketManager gBucketMgr; -+extern BUCKET_DESC gCoreBuckets[]; -+extern uint32_t gCurrentFrame; -+ -+INLINE void rdtscReset() -+{ -+ gCurrentFrame = 0; -+ gBucketMgr.ClearThreads(); -+ gBucketMgr.ClearBuckets(); -+} -+ -+INLINE void rdtscInit(int threadId) -+{ -+ // register all the buckets once -+ if (threadId == 0) -+ { -+ gBucketMap.resize(NumBuckets); -+ for (uint32_t i = 0; i < NumBuckets; ++i) -+ { -+ gBucketMap[i] = gBucketMgr.RegisterBucket(gCoreBuckets[i]); -+ } -+ } -+ -+ std::string name = threadId == 0 ? "API" : "WORKER"; -+ gBucketMgr.RegisterThread(name); -+} -+ -+INLINE void rdtscStart(uint32_t bucketId) -+{ -+ uint32_t id = gBucketMap[bucketId]; -+ gBucketMgr.StartBucket(id); -+} -+ -+INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId) -+{ -+ uint32_t id = gBucketMap[bucketId]; -+ gBucketMgr.StopBucket(id); -+} -+ -+INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2) -+{ -+ -+} -+ -+INLINE void rdtscEndFrame() -+{ -+ gCurrentFrame++; -+ -+ if (gCurrentFrame == KNOB_BUCKETS_START_FRAME) -+ { -+ gBucketMgr.StartCapture(); -+ } -+ -+ if (gCurrentFrame == KNOB_BUCKETS_END_FRAME) -+ { -+ gBucketMgr.StopCapture(); -+ gBucketMgr.PrintReport("rdtsc.txt"); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h -new file mode 100644 -index 0000000..ad8b91fc ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/state.h -@@ -0,0 +1,918 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file state.h -+* -+* @brief Definitions for API state. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "common/simdintrin.h" -+ -+// clear flags -+#define SWR_CLEAR_NONE 0 -+#define SWR_CLEAR_COLOR (1 << 0) -+#define SWR_CLEAR_DEPTH (1 << 1) -+#define SWR_CLEAR_STENCIL (1 << 2) -+ -+enum DRIVER_TYPE -+{ -+ DX, -+ GL -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// PRIMITIVE_TOPOLOGY. 
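-+/// TOP_PATCHLIST_1..TOP_PATCHLIST_32 encode patch lists with 1 to 32 control points
-+/// (value TOP_PATCHLIST_BASE + N).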
-+////////////////////////////////////////////////////////////////////////// -+enum PRIMITIVE_TOPOLOGY -+{ -+ TOP_UNKNOWN = 0x0, -+ TOP_POINT_LIST = 0x1, -+ TOP_LINE_LIST = 0x2, -+ TOP_LINE_STRIP = 0x3, -+ TOP_TRIANGLE_LIST = 0x4, -+ TOP_TRIANGLE_STRIP = 0x5, -+ TOP_TRIANGLE_FAN = 0x6, -+ TOP_QUAD_LIST = 0x7, -+ TOP_QUAD_STRIP = 0x8, -+ TOP_LINE_LIST_ADJ = 0x9, -+ TOP_LISTSTRIP_ADJ = 0xA, -+ TOP_TRI_LIST_ADJ = 0xB, -+ TOP_TRI_STRIP_ADJ = 0xC, -+ TOP_TRI_STRIP_REVERSE = 0xD, -+ TOP_POLYGON = 0xE, -+ TOP_RECT_LIST = 0xF, -+ TOP_LINE_LOOP = 0x10, -+ TOP_POINT_LIST_BF = 0x11, -+ TOP_LINE_STRIP_CONT = 0x12, -+ TOP_LINE_STRIP_BF = 0x13, -+ TOP_LINE_STRIP_CONT_BF = 0x14, -+ TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, -+ TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? -+ -+ TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. -+ TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches -+ TOP_PATCHLIST_2 = 0x21, -+ TOP_PATCHLIST_3 = 0x22, -+ TOP_PATCHLIST_4 = 0x23, -+ TOP_PATCHLIST_5 = 0x24, -+ TOP_PATCHLIST_6 = 0x25, -+ TOP_PATCHLIST_7 = 0x26, -+ TOP_PATCHLIST_8 = 0x27, -+ TOP_PATCHLIST_9 = 0x28, -+ TOP_PATCHLIST_10 = 0x29, -+ TOP_PATCHLIST_11 = 0x2A, -+ TOP_PATCHLIST_12 = 0x2B, -+ TOP_PATCHLIST_13 = 0x2C, -+ TOP_PATCHLIST_14 = 0x2D, -+ TOP_PATCHLIST_15 = 0x2E, -+ TOP_PATCHLIST_16 = 0x2F, -+ TOP_PATCHLIST_17 = 0x30, -+ TOP_PATCHLIST_18 = 0x31, -+ TOP_PATCHLIST_19 = 0x32, -+ TOP_PATCHLIST_20 = 0x33, -+ TOP_PATCHLIST_21 = 0x34, -+ TOP_PATCHLIST_22 = 0x35, -+ TOP_PATCHLIST_23 = 0x36, -+ TOP_PATCHLIST_24 = 0x37, -+ TOP_PATCHLIST_25 = 0x38, -+ TOP_PATCHLIST_26 = 0x39, -+ TOP_PATCHLIST_27 = 0x3A, -+ TOP_PATCHLIST_28 = 0x3B, -+ TOP_PATCHLIST_29 = 0x3C, -+ TOP_PATCHLIST_30 = 0x3D, -+ TOP_PATCHLIST_31 = 0x3E, -+ TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_SHADER_TYPE -+////////////////////////////////////////////////////////////////////////// -+enum SWR_SHADER_TYPE -+{ -+ SHADER_VERTEX, -+ SHADER_GEOMETRY, -+ SHADER_DOMAIN, -+ SHADER_HULL, -+ SHADER_PIXEL, -+ SHADER_COMPUTE, -+ -+ NUM_SHADER_TYPES, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_RENDERTARGET_ATTACHMENT -+/// @todo Its not clear what an "attachment" means. Its not common term. 
-+////////////////////////////////////////////////////////////////////////// -+enum SWR_RENDERTARGET_ATTACHMENT -+{ -+ SWR_ATTACHMENT_COLOR0, -+ SWR_ATTACHMENT_COLOR1, -+ SWR_ATTACHMENT_COLOR2, -+ SWR_ATTACHMENT_COLOR3, -+ SWR_ATTACHMENT_COLOR4, -+ SWR_ATTACHMENT_COLOR5, -+ SWR_ATTACHMENT_COLOR6, -+ SWR_ATTACHMENT_COLOR7, -+ SWR_ATTACHMENT_DEPTH, -+ SWR_ATTACHMENT_STENCIL, -+ -+ SWR_NUM_ATTACHMENTS -+}; -+ -+#define SWR_NUM_RENDERTARGETS 8 -+ -+#define SWR_ATTACHMENT_COLOR0_BIT 0x001 -+#define SWR_ATTACHMENT_COLOR1_BIT 0x002 -+#define SWR_ATTACHMENT_COLOR2_BIT 0x004 -+#define SWR_ATTACHMENT_COLOR3_BIT 0x008 -+#define SWR_ATTACHMENT_COLOR4_BIT 0x010 -+#define SWR_ATTACHMENT_COLOR5_BIT 0x020 -+#define SWR_ATTACHMENT_COLOR6_BIT 0x040 -+#define SWR_ATTACHMENT_COLOR7_BIT 0x080 -+#define SWR_ATTACHMENT_DEPTH_BIT 0x100 -+#define SWR_ATTACHMENT_STENCIL_BIT 0x200 -+#define SWR_ATTACHMENT_MASK_ALL 0x3ff -+#define SWR_ATTACHMENT_MASK_COLOR 0x0ff -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SWR Inner Tessellation factor ID -+/// See above GetTessFactorOutputPosition code for documentation -+enum SWR_INNER_TESSFACTOR_ID -+{ -+ SWR_QUAD_U_TRI_INSIDE, -+ SWR_QUAD_V_INSIDE, -+ -+ SWR_NUM_INNER_TESS_FACTORS, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief SWR Outer Tessellation factor ID -+/// See above GetTessFactorOutputPosition code for documentation -+enum SWR_OUTER_TESSFACTOR_ID -+{ -+ SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, -+ SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY, -+ SWR_QUAD_U_EQ1_TRI_W, -+ SWR_QUAD_V_EQ1, -+ -+ SWR_NUM_OUTER_TESS_FACTORS, -+}; -+ -+ -+///////////////////////////////////////////////////////////////////////// -+/// simdvertex -+/// @brief Defines a vertex element that holds all the data for SIMD vertices. 
-+/// Contains position in clip space, hardcoded to attribute 0, -+/// space for up to 32 attributes, as well as any SGV values generated -+/// by the pipeline (to be implemented) -+///////////////////////////////////////////////////////////////////////// -+#define VERTEX_POSITION_SLOT 0 -+#define VERTEX_ATTRIB_START_SLOT 1 -+#define VERTEX_ATTRIB_END_SLOT 32 -+#define VERTEX_RTAI_SLOT 33 // GS will write RenderTargetArrayIndex here -+#define VERTEX_PRIMID_SLOT 34 // GS will write PrimId here -+#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS will write lower 4 clip/cull dist -+#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS will write upper 4 clip/cull dist -+static_assert(VERTEX_CLIPCULL_DIST_HI_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size"); -+ -+// SoAoSoA -+struct simdvertex -+{ -+ simdvector attrib[KNOB_NUM_ATTRIBUTES]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_VS_CONTEXT -+/// @brief Input to vertex shader -+///////////////////////////////////////////////////////////////////////// -+struct SWR_VS_CONTEXT -+{ -+ simdvertex* pVin; // IN: SIMD input vertex data store -+ simdvertex* pVout; // OUT: SIMD output vertex data store -+ -+ uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD -+ simdscalari VertexID; // IN: Vertex ID -+ simdscalari mask; // IN: Active mask for shader -+}; -+ -+///////////////////////////////////////////////////////////////////////// -+/// ScalarCPoint -+/// @brief defines a control point element as passed from the output -+/// of the hull shader to the input of the domain shader -+///////////////////////////////////////////////////////////////////////// -+struct ScalarAttrib -+{ -+ float x; -+ float y; -+ float z; -+ float w; -+}; -+ -+struct ScalarCPoint -+{ -+ ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TESSELLATION_FACTORS -+/// @brief Tessellation factors structure (non-vector) -+///////////////////////////////////////////////////////////////////////// -+struct SWR_TESSELLATION_FACTORS -+{ -+ float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; -+ float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; -+}; -+ -+#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches -+struct ScalarPatch -+{ -+ SWR_TESSELLATION_FACTORS tessFactors; -+ ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; -+ ScalarCPoint patchData; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_HS_CONTEXT -+/// @brief Input to hull shader -+///////////////////////////////////////////////////////////////////////// -+struct SWR_HS_CONTEXT -+{ -+ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data -+ simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call -+ ScalarPatch* pCPout; // OUT: Output control point patch -+ // SIMD-sized-array of SCALAR patches -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_DS_CONTEXT -+/// @brief Input to domain shader -+///////////////////////////////////////////////////////////////////////// -+struct SWR_DS_CONTEXT -+{ -+ uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation -+ uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data. 
-+ uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component -+ ScalarPatch* pCpIn; // IN: (SCALAR) Control patch -+ simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords -+ simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords -+ simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component) -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_GS_CONTEXT -+/// @brief Input to geometry shader. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_GS_CONTEXT -+{ -+ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims -+ simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call -+ uint32_t InstanceID; // IN: input instance ID -+ uint8_t* pStream[4]; // OUT: output streams -+ uint8_t* pCutBuffer; // OUT: cut buffer -+ simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_PS_CONTEXT -+/// @brief Input to pixel shader. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_PS_CONTEXT -+{ -+ simdscalar vX; // IN: x location of pixels -+ simdscalar vY; // IN: y location of pixels -+ simdscalar vZ; // INOUT: z location of pixels -+ simdscalari mask; // INOUT: mask for kill -+ -+ // rasterizer generated barycentric components -+ simdscalar vI; // IN: Barycentric I component -+ simdscalar vJ; // IN: Barycentric J component -+ simdscalar vOneOverW; // IN: 1/w -+ -+ const float* pAttribs; // IN: pointer to attribute barycentric coefficients -+ const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients -+ const float *I; // IN: Barycentric A, B, and C coefs used to compute I -+ const float *J; // IN: Barycentric A, B, and C coefs used to compute J -+ float recipDet; // IN: 1/Det, used when barycentric interpolating attributes -+ const float* pSamplePos; // IN: array of sample positions -+ simdvector shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget -+ -+ uint32_t frontFace; // IN: front- 1, back- 0 -+ uint32_t primID; // IN: primitive ID -+ uint32_t sampleIndex; // IN: sampleIndex -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_CS_CONTEXT -+/// @brief Input to compute shader. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_CS_CONTEXT -+{ -+ // The ThreadGroupId is the current thread group index relative -+ // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup, -+ // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader. -+ -+ // Compute shader accepts the following system values. -+ // o ThreadId - Current thread id relative to all other threads in dispatch. -+ // o ThreadGroupId - Current thread group id relative to all other groups in dispatch. -+ // o ThreadIdInGroup - Current thread relative to all threads in the current thread group. -+ // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup. -+ // -+ // All of these system values can be computed in the shader. They will be -+ // derived from the current tile counter. The tile counter is an atomic counter that -+ // resides in the draw context and is initialized to the product of the dispatch dims. 
-+ // -+ // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z -+ // -+ // Each CPU worker thread will atomically decrement this counter and passes the current -+ // count into the shader. When the count reaches 0 then all thread groups in the -+ // dispatch call have been completed. -+ -+ uint32_t tileCounter; // The tile counter value for this thread group. -+ -+ // Dispatch dimensions used by shader to compute system values from the tile counter. -+ uint32_t dispatchDims[3]; -+ -+ uint8_t* pTGSM; // Thread Group Shared Memory pointer. -+}; -+ -+// enums -+enum SWR_TILE_MODE -+{ -+ SWR_TILE_NONE = 0x0, // Linear mode (no tiling) -+ SWR_TILE_MODE_WMAJOR, // W major tiling -+ SWR_TILE_MODE_XMAJOR, // X major tiling -+ SWR_TILE_MODE_YMAJOR, // Y major tiling -+ SWR_TILE_SWRZ, // SWR-Z tiling -+ -+ SWR_TILE_MODE_COUNT -+}; -+ -+enum SWR_SURFACE_TYPE -+{ -+ SURFACE_1D = 0, -+ SURFACE_2D = 1, -+ SURFACE_3D = 2, -+ SURFACE_CUBE = 3, -+ SURFACE_BUFFER = 4, -+ SURFACE_STRUCTURED_BUFFER = 5, -+ SURFACE_NULL = 7 -+}; -+ -+enum SWR_ZFUNCTION -+{ -+ ZFUNC_ALWAYS, -+ ZFUNC_NEVER, -+ ZFUNC_LT, -+ ZFUNC_EQ, -+ ZFUNC_LE, -+ ZFUNC_GT, -+ ZFUNC_NE, -+ ZFUNC_GE, -+ NUM_ZFUNC -+}; -+ -+enum SWR_STENCILOP -+{ -+ STENCILOP_KEEP, -+ STENCILOP_ZERO, -+ STENCILOP_REPLACE, -+ STENCILOP_INCRSAT, -+ STENCILOP_DECRSAT, -+ STENCILOP_INCR, -+ STENCILOP_DECR, -+ STENCILOP_INVERT -+}; -+ -+enum SWR_BLEND_FACTOR -+{ -+ BLENDFACTOR_ONE, -+ BLENDFACTOR_SRC_COLOR, -+ BLENDFACTOR_SRC_ALPHA, -+ BLENDFACTOR_DST_ALPHA, -+ BLENDFACTOR_DST_COLOR, -+ BLENDFACTOR_SRC_ALPHA_SATURATE, -+ BLENDFACTOR_CONST_COLOR, -+ BLENDFACTOR_CONST_ALPHA, -+ BLENDFACTOR_SRC1_COLOR, -+ BLENDFACTOR_SRC1_ALPHA, -+ BLENDFACTOR_ZERO, -+ BLENDFACTOR_INV_SRC_COLOR, -+ BLENDFACTOR_INV_SRC_ALPHA, -+ BLENDFACTOR_INV_DST_ALPHA, -+ BLENDFACTOR_INV_DST_COLOR, -+ BLENDFACTOR_INV_CONST_COLOR, -+ BLENDFACTOR_INV_CONST_ALPHA, -+ BLENDFACTOR_INV_SRC1_COLOR, -+ BLENDFACTOR_INV_SRC1_ALPHA -+}; -+ -+enum SWR_BLEND_OP -+{ -+ BLENDOP_ADD, -+ BLENDOP_SUBTRACT, -+ BLENDOP_REVSUBTRACT, -+ BLENDOP_MIN, -+ BLENDOP_MAX, -+}; -+ -+struct SWR_SURFACE_STATE -+{ -+ uint8_t *pBaseAddress; -+ SWR_SURFACE_TYPE type; // @llvm_enum -+ SWR_FORMAT format; // @llvm_enum -+ uint32_t width; -+ uint32_t height; -+ uint32_t depth; -+ uint32_t numSamples; -+ uint32_t pitch; -+ uint32_t qpitch; -+ uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler -+ uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed -+ float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler -+ uint32_t lod; // for render targets, the lod being rendered to -+ uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces -+ SWR_TILE_MODE tileMode; // @llvm_enum -+ uint32_t halign; -+ uint32_t valign; -+ -+ uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces -+ -+ uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. -+}; -+ -+// vertex fetch state -+// WARNING- any changes to this struct need to be reflected -+// in the fetch shader jit -+struct SWR_VERTEX_BUFFER_STATE -+{ -+ uint32_t index; -+ uint32_t pitch; -+ const uint8_t *pData; -+ uint32_t size; -+ uint32_t numaNode; -+ uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks -+ uint32_t partialInboundsSize; // size % pitch. 
precalculated value used by fetch shader for partially OOB vertices -+}; -+ -+struct SWR_INDEX_BUFFER_STATE -+{ -+ // Format type for indices (e.g. UINT16, UINT32, etc.) -+ SWR_FORMAT format; // @llvm_enum -+ const void *pIndices; -+ uint32_t size; -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_FETCH_CONTEXT -+/// @brief Input to fetch shader. -+/// @note WARNING - Changes to this struct need to be reflected in the -+/// fetch shader jit. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_FETCH_CONTEXT -+{ -+ const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers -+ const int32_t* pIndices; // IN: pointer to index buffer for indexed draws -+ const int32_t* pLastIndex; // IN: pointer to end of index buffer, used for bounds checking -+ uint32_t CurInstance; // IN: current instance -+ uint32_t BaseVertex; // IN: base vertex -+ uint32_t StartVertex; // IN: start vertex -+ uint32_t StartInstance; // IN: start instance -+ simdscalari VertexID; // OUT: vector of vertex IDs -+ simdscalari CutMask; // OUT: vector mask of indices which have the cut index value -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_STATS -+/// -+/// @brief All statistics generated by SWR go here. These are public -+/// to driver. -+///////////////////////////////////////////////////////////////////////// -+struct SWR_STATS -+{ -+ // Occlusion Query -+ uint64_t DepthPassCount; // Number of passing depth tests. Not exact. -+ -+ // Pipeline Stats -+ uint64_t IaVertices; // Number of Fetch Shader vertices -+ uint64_t IaPrimitives; // Number of PA primitives. -+ uint64_t VsInvocations; // Number of Vertex Shader invocations -+ uint64_t HsInvocations; // Number of Hull Shader invocations -+ uint64_t DsInvocations; // Number of Domain Shader invocations -+ uint64_t GsInvocations; // Number of Geometry Shader invocations -+ uint64_t PsInvocations; // Number of Pixel Shader invocations -+ uint64_t CsInvocations; // Number of Compute Shader invocations -+ uint64_t CInvocations; // Number of clipper invocations -+ uint64_t CPrimitives; // Number of clipper primitives. -+ uint64_t GsPrimitives; // Number of prims GS outputs. -+ -+ // Streamout Stats -+ uint32_t SoWriteOffset[4]; -+ uint64_t SoPrimStorageNeeded[4]; -+ uint64_t SoNumPrimsWritten[4]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_BUFFERS -+///////////////////////////////////////////////////////////////////////// -+ -+#define MAX_SO_STREAMS 4 -+#define MAX_ATTRIBUTES 32 -+ -+struct SWR_STREAMOUT_BUFFER -+{ -+ bool enable; -+ -+ // Pointers to streamout buffers. -+ uint32_t* pBuffer; -+ -+ // Size of buffer in dwords. -+ uint32_t bufferSize; -+ -+ // Vertex pitch of buffer in dwords. -+ uint32_t pitch; -+ -+ // Offset into buffer in dwords. SOS will increment this offset. -+ uint32_t streamOffset; -+ -+ // Offset to the SO write offset. If not null then we update offset here. -+ uint32_t* pWriteOffset; -+ -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_STATE -+///////////////////////////////////////////////////////////////////////// -+struct SWR_STREAMOUT_STATE -+{ -+ // This disables stream output. -+ bool soEnable; -+ -+ // which streams are enabled for streamout -+ bool streamEnable[MAX_SO_STREAMS]; -+ -+ // If set then do not send any streams to the rasterizer. 
-+ bool rasterizerDisable; -+ -+ // Specifies which stream to send to the rasterizer. -+ uint32_t streamToRasterizer; -+ -+ // The stream masks specify which attributes are sent to which streams. -+ // These masks help the FE to setup the pPrimData buffer that is passed -+ // the the Stream Output Shader (SOS) function. -+ uint32_t streamMasks[MAX_SO_STREAMS]; -+ -+ // Number of attributes, including position, per vertex that are streamed out. -+ // This should match number of bits in stream mask. -+ uint32_t streamNumEntries[MAX_SO_STREAMS]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_CONTEXT - Passed to SOS -+///////////////////////////////////////////////////////////////////////// -+struct SWR_STREAMOUT_CONTEXT -+{ -+ uint32_t* pPrimData; -+ SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; -+ -+ // Num prims written for this stream -+ uint32_t numPrimsWritten; -+ -+ // Num prims that should have been written if there were no overflow. -+ uint32_t numPrimStorageNeeded; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_GS_STATE - Geometry shader state -+///////////////////////////////////////////////////////////////////////// -+struct SWR_GS_STATE -+{ -+ bool gsEnable; -+ -+ // number of input attributes per vertex. used by the frontend to -+ // optimize assembling primitives for GS -+ uint32_t numInputAttribs; -+ -+ // output topology - can be point, tristrip, or linestrip -+ PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum -+ -+ // maximum number of verts that can be emitted by a single instance of the GS -+ uint32_t maxNumVerts; -+ -+ // instance count -+ uint32_t instanceCount; -+ -+ // geometry shader emits renderTargetArrayIndex -+ bool emitsRenderTargetArrayIndex; -+ -+ // geometry shader emits PrimitiveID -+ bool emitsPrimitiveID; -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS -+///////////////////////////////////////////////////////////////////////// -+enum SWR_TS_OUTPUT_TOPOLOGY -+{ -+ SWR_TS_OUTPUT_POINT, -+ SWR_TS_OUTPUT_LINE, -+ SWR_TS_OUTPUT_TRI_CW, -+ SWR_TS_OUTPUT_TRI_CCW, -+ -+ SWR_TS_OUTPUT_TOPOLOGY_COUNT -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_PARTITIONING - Defines tessellation algorithm -+///////////////////////////////////////////////////////////////////////// -+enum SWR_TS_PARTITIONING -+{ -+ SWR_TS_INTEGER, -+ SWR_TS_ODD_FRACTIONAL, -+ SWR_TS_EVEN_FRACTIONAL, -+ -+ SWR_TS_PARTITIONING_COUNT -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_DOMAIN - Defines Tessellation Domain -+///////////////////////////////////////////////////////////////////////// -+enum SWR_TS_DOMAIN -+{ -+ SWR_TS_QUAD, -+ SWR_TS_TRI, -+ SWR_TS_ISOLINE, -+ -+ SWR_TS_DOMAIN_COUNT -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_TS_STATE - Tessellation state -+///////////////////////////////////////////////////////////////////////// -+struct SWR_TS_STATE -+{ -+ bool tsEnable; -+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum -+ SWR_TS_PARTITIONING partitioning; // @llvm_enum -+ SWR_TS_DOMAIN domain; // @llvm_enum -+ -+ PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum -+ -+ uint32_t numHsInputAttribs; -+ uint32_t numHsOutputAttribs; -+ uint32_t numDsOutputAttribs; -+}; -+ -+// output merger state -+struct 
SWR_RENDER_TARGET_BLEND_STATE -+{ -+ uint32_t colorBlendEnable : 1; -+ uint32_t sourceAlphaBlendFactor : 5; -+ uint32_t destAlphaBlendFactor : 5; -+ uint32_t sourceBlendFactor : 5; -+ uint32_t destBlendFactor : 5; -+ uint32_t colorBlendFunc : 3; -+ uint32_t alphaBlendFunc : 3; -+ -+ uint32_t writeDisableRed : 1; -+ uint32_t writeDisableGreen : 1; -+ uint32_t writeDisableBlue : 1; -+ uint32_t writeDisableAlpha : 1; -+}; -+static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 4, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); -+ -+struct SWR_BLEND_STATE -+{ -+ float constantColor[4]; // constant blend factor color in RGBA float -+ bool independentAlphaBlendEnable; -+ SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; -+}; -+static_assert(sizeof(SWR_BLEND_STATE) == 52, "Invalid SWR_BLEND_STATE size"); -+ -+////////////////////////////////////////////////////////////////////////// -+/// FUNCTION POINTERS FOR SHADERS -+ -+typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); -+typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext); -+typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext); -+typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext); -+typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext); -+typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); -+typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); -+typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); -+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, BYTE*, simdvector&); -+ -+////////////////////////////////////////////////////////////////////////// -+/// FRONTEND_STATE -+///////////////////////////////////////////////////////////////////////// -+struct SWR_FRONTEND_STATE -+{ -+ // skip clip test, perspective divide, and viewport transform -+ // intended for verts in screen space -+ bool vpTransformDisable; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// VIEWPORT_MATRIX -+///////////////////////////////////////////////////////////////////////// -+struct SWR_VIEWPORT_MATRIX -+{ -+ float m00; -+ float m11; -+ float m22; -+ float m30; -+ float m31; -+ float m32; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_VIEWPORT -+///////////////////////////////////////////////////////////////////////// -+struct SWR_VIEWPORT -+{ -+ float x; -+ float y; -+ float width; -+ float height; -+ float minZ; -+ float maxZ; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_CULLMODE -+////////////////////////////////////////////////////////////////////////// -+enum SWR_CULLMODE -+{ -+ SWR_CULLMODE_BOTH, -+ SWR_CULLMODE_NONE, -+ SWR_CULLMODE_FRONT, -+ SWR_CULLMODE_BACK -+}; -+ -+enum SWR_FILLMODE -+{ -+ SWR_FILLMODE_POINT, -+ SWR_FILLMODE_WIREFRAME, -+ SWR_FILLMODE_SOLID -+}; -+ -+enum SWR_FRONTWINDING -+{ -+ SWR_FRONTWINDING_CW, -+ SWR_FRONTWINDING_CCW -+}; -+ -+#define SWR_MAX_NUM_MULTISAMPLES 16 -+enum SWR_MULTISAMPLE_COUNT -+{ -+ SWR_MULTISAMPLE_1X, -+ SWR_MULTISAMPLE_2X, -+ SWR_MULTISAMPLE_4X, -+ SWR_MULTISAMPLE_8X, -+ SWR_MULTISAMPLE_16X, -+ SWR_MULTISAMPLE_TYPE_MAX -+}; -+ -+enum SWR_PIXEL_LOCATION -+{ -+ SWR_PIXEL_LOCATION_CENTER, -+ SWR_PIXEL_LOCATION_UL, -+}; -+ -+// fixed point screen space sample locations within a pixel 
-+struct SWR_MULTISAMPLE_POS -+{ -+ uint32_t x; -+ uint32_t y; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// SWR_RASTSTATE -+////////////////////////////////////////////////////////////////////////// -+struct SWR_RASTSTATE -+{ -+ uint32_t cullMode : 2; -+ uint32_t fillMode : 2; -+ uint32_t frontWinding : 1; -+ uint32_t scissorEnable : 1; -+ uint32_t depthClipEnable : 1; -+ float pointSize; -+ float lineWidth; -+ -+ // point size output from the VS -+ bool pointParam; -+ uint32_t pointSizeAttrib; -+ -+ // point sprite -+ bool pointSpriteEnable; -+ bool pointSpriteTopOrigin; -+ uint32_t pointSpriteFESlot; -+ -+ // depth bias -+ float depthBias; -+ float slopeScaledDepthBias; -+ float depthBiasClamp; -+ SWR_FORMAT depthFormat; // @llvm_enum -+ -+ // multisample state -+ SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum -+ SWR_MULTISAMPLE_COUNT forcedSampleCount; // @llvm_enum -+ uint32_t pixelLocation; // UL or Center -+ uint32_t sampleMask; -+ uint8_t isSampleMasked[SWR_MAX_NUM_MULTISAMPLES]; -+ bool pixelOffset; // offset pixel positions by .5 in both the horizontal and vertical direction -+ SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES]; -+ -+ // user clip/cull distance enables -+ uint8_t cullDistanceMask; -+ uint8_t clipDistanceMask; -+}; -+ -+// backend state -+struct SWR_BACKEND_STATE -+{ -+ uint32_t constantInterpolationMask; -+ uint8_t numAttributes; -+ uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; -+}; -+ -+union SWR_DEPTH_STENCIL_STATE -+{ -+ struct -+ { -+ // dword 0 -+ uint32_t depthWriteEnable : 1; -+ uint32_t depthTestEnable : 1; -+ uint32_t stencilWriteEnable : 1; -+ uint32_t stencilTestEnable : 1; -+ uint32_t doubleSidedStencilTestEnable : 1; -+ -+ uint32_t depthTestFunc : 3; -+ uint32_t stencilTestFunc : 3; -+ -+ uint32_t backfaceStencilPassDepthPassOp : 3; -+ uint32_t backfaceStencilPassDepthFailOp : 3; -+ uint32_t backfaceStencilFailOp : 3; -+ uint32_t backfaceStencilTestFunc : 3; -+ uint32_t stencilPassDepthPassOp : 3; -+ uint32_t stencilPassDepthFailOp : 3; -+ uint32_t stencilFailOp : 3; -+ -+ // dword 1 -+ uint8_t backfaceStencilWriteMask; -+ uint8_t backfaceStencilTestMask; -+ uint8_t stencilWriteMask; -+ uint8_t stencilTestMask; -+ -+ // dword 2 -+ uint8_t backfaceStencilRefValue; -+ uint8_t stencilRefValue; -+ }; -+ uint32_t value[3]; -+}; -+ -+enum SWR_SHADING_RATE -+{ -+ SWR_SHADING_RATE_PIXEL, -+ SWR_SHADING_RATE_SAMPLE, -+ SWR_SHADING_RATE_COARSE, -+ SWR_SHADING_RATE_MAX, -+}; -+ -+// pixel shader state -+struct SWR_PS_STATE -+{ -+ // dword 0-1 -+ PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn -+ -+ // dword 2 -+ uint32_t killsPixel : 1; // pixel shader can kill pixels -+ uint32_t writesODepth : 1; // pixel shader writes to depth -+ uint32_t usesSourceDepth: 1; // pixel shader reads depth -+ uint32_t maxRTSlotUsed : 3; // maximum render target slot pixel shader writes to [0..7] -+ uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h -new file mode 100644 -index 0000000..915ac77 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h -@@ -0,0 +1,88 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tessellator.h -+* -+* @brief Tessellator fixed function unit interface definition -+* -+******************************************************************************/ -+#pragma once -+ -+/// Allocate and initialize a new tessellation context -+HANDLE SWR_API TSInitCtx( -+ SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) -+ SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm -+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology -+ void* pContextMem, ///< [IN] Memory to use for the context -+ size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required -+ -+/// Destroy & de-allocate tessellation context -+void SWR_API TSDestroyCtx( -+ HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed -+ -+struct SWR_TS_TESSELLATED_DATA -+{ -+ uint32_t NumPrimitives; -+ uint32_t NumDomainPoints; -+ -+ uint32_t* ppIndices[3]; -+ float* pDomainPointsU; -+ float* pDomainPointsV; -+ // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] -+}; -+ -+/// Perform Tessellation -+void SWR_API TSTessellate( -+ HANDLE tsCtx, ///< [IN] Tessellation Context -+ const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors -+ SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data -+ -+ -+ -+/// @TODO - Implement OSS tessellator -+ -+INLINE HANDLE SWR_API TSInitCtx( -+ SWR_TS_DOMAIN tsDomain, -+ SWR_TS_PARTITIONING tsPartitioning, -+ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, -+ void* pContextMem, -+ size_t& memSize) -+{ -+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); -+ return NULL; -+} -+ -+ -+INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) -+{ -+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); -+} -+ -+ -+INLINE void SWR_API TSTessellate( -+ HANDLE tsCtx, -+ const SWR_TESSELLATION_FACTORS& tsTessFactors, -+ SWR_TS_TESSELLATED_DATA& tsTessellatedData) -+{ -+ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp -new file mode 100644 -index 0000000..590bed4 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp -@@ -0,0 +1,884 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+****************************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#if defined(__linux__) || defined(__gnu_linux__) -+#include -+#include -+#include -+#include -+#endif -+ -+#include "common/os.h" -+#include "context.h" -+#include "frontend.h" -+#include "backend.h" -+#include "rasterizer.h" -+#include "rdtsc_core.h" -+#include "tilemgr.h" -+#include "core/multisample.h" -+ -+// ThreadId -+struct Core -+{ -+ uint32_t procGroup = 0; -+ std::vector threadIds; -+}; -+ -+struct NumaNode -+{ -+ std::vector cores; -+}; -+ -+typedef std::vector CPUNumaNodes; -+ -+void CalculateProcessorTopology(CPUNumaNodes& out_nodes) -+{ -+ out_nodes.clear(); -+#if defined(_WIN32) -+ -+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; -+ DWORD bufSize = sizeof(buffer); -+ -+ BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); -+ SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); -+ -+ uint32_t count = bufSize / buffer->Size; -+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; -+ -+ for (uint32_t i = 0; i < count; ++i) -+ { -+ SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); -+ for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) -+ { -+ auto& gmask = pBuffer->Processor.GroupMask[g]; -+ uint32_t threadId = 0; -+ uint32_t procGroup = gmask.Group; -+ -+ Core* pCore = nullptr; -+ -+ uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); -+ -+ while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) -+ { -+ // clear mask -+ gmask.Mask &= ~(KAFFINITY(1) << threadId); -+ -+ // Find Numa Node -+ PROCESSOR_NUMBER procNum = {}; -+ procNum.Group = WORD(procGroup); -+ procNum.Number = UCHAR(threadId); -+ -+ uint32_t numaId = 0; -+ ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); -+ SWR_ASSERT(ret); -+ -+ // Store data -+ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); -+ auto& numaNode = out_nodes[numaId]; -+ -+ uint32_t coreId = 0; -+ -+ if (nullptr == pCore) -+ { -+ numaNode.cores.push_back(Core()); -+ pCore = &numaNode.cores.back(); -+ pCore->procGroup = procGroup; -+#if !defined(_WIN64) -+ coreId = (uint32_t)numaNode.cores.size(); -+ if ((coreId * numThreads) >= 32) -+ { -+ // Windows doesn't return threadIds >= 32 for a processor group correctly -+ 
// when running a 32-bit application. -+ // Just save -1 as the threadId -+ threadId = uint32_t(-1); -+ } -+#endif -+ } -+ pCore->threadIds.push_back(threadId); -+ } -+ } -+ pBuffer = PtrAdd(pBuffer, pBuffer->Size); -+ } -+ -+ -+#elif defined(__linux__) || defined (__gnu_linux__) -+ -+ // Parse /proc/cpuinfo to get full topology -+ std::ifstream input("/proc/cpuinfo"); -+ std::string line; -+ char* c; -+ uint32_t threadId = uint32_t(-1); -+ uint32_t coreId = uint32_t(-1); -+ uint32_t numaId = uint32_t(-1); -+ -+ while (std::getline(input, line)) -+ { -+ if (line.find("processor") != std::string::npos) -+ { -+ if (threadId != uint32_t(-1)) -+ { -+ // Save information. -+ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); -+ auto& numaNode = out_nodes[numaId]; -+ if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); -+ auto& core = numaNode.cores[coreId]; -+ -+ core.procGroup = coreId; -+ core.threadIds.push_back(threadId); -+ } -+ -+ auto data_start = line.find(": ") + 2; -+ threadId = std::strtoul(&line.c_str()[data_start], &c, 10); -+ continue; -+ } -+ if (line.find("core id") != std::string::npos) -+ { -+ auto data_start = line.find(": ") + 2; -+ coreId = std::strtoul(&line.c_str()[data_start], &c, 10); -+ continue; -+ } -+ if (line.find("physical id") != std::string::npos) -+ { -+ auto data_start = line.find(": ") + 2; -+ numaId = std::strtoul(&line.c_str()[data_start], &c, 10); -+ continue; -+ } -+ } -+ -+ if (threadId != uint32_t(-1)) -+ { -+ // Save information. -+ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); -+ auto& numaNode = out_nodes[numaId]; -+ if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); -+ auto& core = numaNode.cores[coreId]; -+ -+ core.procGroup = coreId; -+ core.threadIds.push_back(threadId); -+ } -+ -+ for (uint32_t node = 0; node < out_nodes.size(); node++) { -+ auto& numaNode = out_nodes[node]; -+ auto it = numaNode.cores.begin(); -+ for ( ; it != numaNode.cores.end(); ) { -+ if (it->threadIds.size() == 0) -+ numaNode.cores.erase(it); -+ else -+ ++it; -+ } -+ } -+ -+#else -+ -+#error Unsupported platform -+ -+#endif -+} -+ -+ -+void bindThread(uint32_t threadId, uint32_t procGroupId = 0) -+{ -+#if defined(_WIN32) -+ { -+ GROUP_AFFINITY affinity = {}; -+ affinity.Group = procGroupId; -+ -+#if !defined(_WIN64) -+ if (threadId >= 32) -+ { -+ // In a 32-bit process on Windows it is impossible to bind -+ // to logical processors 32-63 within a processor group. -+ // In this case set the mask to 0 and let the system assign -+ // the processor. Hopefully it will make smart choices. 
-+ affinity.Mask = 0; -+ } -+ else -+#endif -+ { -+ affinity.Mask = KAFFINITY(1) << threadId; -+ } -+ -+ SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr); -+ } -+#else -+ cpu_set_t cpuset; -+ pthread_t thread = pthread_self(); -+ CPU_ZERO(&cpuset); -+ CPU_SET(threadId, &cpuset); -+ -+ pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); -+#endif -+} -+ -+INLINE -+uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) -+{ -+ //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); -+ //return result; -+ return pContext->DrawEnqueued; -+} -+ -+INLINE -+DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId) -+{ -+ return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT]; -+} -+ -+// returns true if dependency not met -+INLINE -+bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw) -+{ -+ return (pDC->dependency > lastRetiredDraw); -+} -+ -+void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -+{ -+ // Load clear color into SIMD register... -+ float *pClearData = (float*)(pHotTile->clearData); -+ simdscalar valR = _simd_broadcast_ss(&pClearData[0]); -+ simdscalar valG = _simd_broadcast_ss(&pClearData[1]); -+ simdscalar valB = _simd_broadcast_ss(&pClearData[2]); -+ simdscalar valA = _simd_broadcast_ss(&pClearData[3]); -+ -+ float *pfBuf = (float*)pHotTile->pBuffer; -+ uint32_t numSamples = pHotTile->numSamples; -+ -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) -+ { -+ _simd_store_ps(pfBuf, valR); -+ pfBuf += KNOB_SIMD_WIDTH; -+ _simd_store_ps(pfBuf, valG); -+ pfBuf += KNOB_SIMD_WIDTH; -+ _simd_store_ps(pfBuf, valB); -+ pfBuf += KNOB_SIMD_WIDTH; -+ _simd_store_ps(pfBuf, valA); -+ pfBuf += KNOB_SIMD_WIDTH; -+ } -+ } -+ } -+} -+ -+void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -+{ -+ // Load clear color into SIMD register... -+ float *pClearData = (float*)(pHotTile->clearData); -+ simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); -+ -+ float *pfBuf = (float*)pHotTile->pBuffer; -+ uint32_t numSamples = pHotTile->numSamples; -+ -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) -+ { -+ _simd_store_ps(pfBuf, valZ); -+ pfBuf += KNOB_SIMD_WIDTH; -+ } -+ } -+ } -+} -+ -+void ClearStencilHotTile(const HOTTILE* pHotTile) -+{ -+ // convert from F32 to U8. -+ uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); -+ //broadcast 32x into __m256i... -+ simdscalari valS = _simd_set1_epi8(clearVal); -+ -+ simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; -+ uint32_t numSamples = pHotTile->numSamples; -+ -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 
-+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) -+ { -+ _simd_store_si(pBuf, valS); -+ pBuf += 1; -+ } -+ } -+ } -+} -+ -+// for draw calls, we initialize the active hot tiles and perform deferred -+// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside -+// the draw routine itself mainly for performance, to avoid unnecessary setup -+// every triangle -+// @todo support deferred clear -+INLINE -+void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) -+{ -+ const API_STATE& state = GetApiState(pDC); -+ HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; -+ const SWR_PS_STATE& psState = state.psState; -+ uint32_t numRTs = psState.maxRTSlotUsed + 1; -+ -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroID, x, y); -+ x *= KNOB_MACROTILE_X_DIM; -+ y *= KNOB_MACROTILE_Y_DIM; -+ -+ uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); -+ -+ // check RT if enabled -+ if (state.psState.pfnPixelShader != nullptr) -+ { -+ for (uint32_t rt = 0; rt < numRTs; ++rt) -+ { -+ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, numSamples); -+ -+ if (pHotTile->state == HOTTILE_INVALID) -+ { -+ RDTSC_START(BELoadTiles); -+ // invalid hottile before draw requires a load from surface before we can draw to it -+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ else if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ RDTSC_START(BELoadTiles); -+ // Clear the tile. -+ ClearColorHotTile(pHotTile); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ } -+ } -+ -+ // check depth if enabled -+ if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable) -+ { -+ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); -+ if (pHotTile->state == HOTTILE_INVALID) -+ { -+ RDTSC_START(BELoadTiles); -+ // invalid hottile before draw requires a load from surface before we can draw to it -+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ else if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ RDTSC_START(BELoadTiles); -+ // Clear the tile. 
-+ ClearDepthHotTile(pHotTile); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ } -+ -+ // check stencil if enabled -+ if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable) -+ { -+ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); -+ if (pHotTile->state == HOTTILE_INVALID) -+ { -+ RDTSC_START(BELoadTiles); -+ // invalid hottile before draw requires a load from surface before we can draw to it -+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ else if (pHotTile->state == HOTTILE_CLEAR) -+ { -+ RDTSC_START(BELoadTiles); -+ // Clear the tile. -+ ClearStencilHotTile(pHotTile); -+ pHotTile->state = HOTTILE_DIRTY; -+ RDTSC_STOP(BELoadTiles, 0, 0); -+ } -+ } -+} -+ -+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, volatile uint64_t& curDrawBE) -+{ -+ // increment our current draw id to the first incomplete draw -+ uint64_t drawEnqueued = GetEnqueuedDraw(pContext); -+ while (curDrawBE < drawEnqueued) -+ { -+ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ -+ // If it's not compute and FE is not done then break out of loop. -+ if (!pDC->doneFE && !pDC->isCompute) break; -+ -+ bool isWorkComplete = (pDC->isCompute) ? -+ pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); -+ -+ if (isWorkComplete) -+ { -+ curDrawBE++; -+ } -+ else -+ { -+ break; -+ } -+ } -+ -+ // If there are no more incomplete draws then return false. -+ return (curDrawBE >= drawEnqueued) ? false : true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief If there is any BE work then go work on it. -+/// @param pContext - pointer to SWR context. -+/// @param workerId - The unique worker ID that is assigned to this thread. -+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread -+/// has its own curDrawBE counter and this ensures that each worker processes all the -+/// draws in order. -+/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its -+/// own set and each time it fails to lock a macrotile, because it's already locked, -+/// then it will add that tile to the lockedTiles set. As a worker begins to work -+/// on future draws the lockedTiles set ensures that it doesn't work on tiles that may -+/// still have work pending in a previous draw. Additionally, the lockedTiles is a -+/// heuristic that can steer a worker back to the same macrotile that it had been -+/// working on in a previous draw. -+void WorkOnFifoBE( -+ SWR_CONTEXT *pContext, -+ uint32_t workerId, -+ volatile uint64_t &curDrawBE, -+ std::unordered_set<uint32_t>& lockedTiles) -+{ -+ // Find the first incomplete draw that has pending work. If no such draw is found then -+ // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. -+ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) -+ { -+ return; -+ } -+ -+ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; -+ -+ // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. -+ lockedTiles.clear(); -+ -+ // Try to work on each draw in order of the available draws in flight. -+ // 1.
If we're on curDrawBE, we can work on any macrotile that is available. -+ // 2. If we're trying to work on draws after curDrawBE, we are restricted to -+ // working on those macrotiles that are known to be complete in the prior draw to -+ // maintain order. The locked tiles provide the history to ensure this. -+ for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i) -+ { -+ DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ -+ if (pDC->isCompute) return; // We don't look at compute work. -+ -+ // First wait for FE to be finished with this draw. This keeps threading model simple -+ // but if there are lots of bubbles between draws then serializing FE and BE may -+ // need to be revisited. -+ if (!pDC->doneFE) break; -+ -+ // If this draw is dependent on a previous draw then we need to bail. -+ if (CheckDependency(pContext, pDC, lastRetiredDraw)) -+ { -+ return; -+ } -+ -+ // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. -+ std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles(); -+ -+ for (uint32_t tileID : macroTiles) -+ { -+ MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); -+ -+ // can only work on this draw if it's not in use by other threads -+ if (lockedTiles.find(tileID) == lockedTiles.end()) -+ { -+ if (tile.getNumQueued()) -+ { -+ if (tile.tryLock()) -+ { -+ BE_WORK *pWork; -+ -+ RDTSC_START(WorkerFoundWork); -+ -+ uint32_t numWorkItems = tile.getNumQueued(); -+ -+ if (numWorkItems != 0) -+ { -+ pWork = tile.peek(); -+ SWR_ASSERT(pWork); -+ if (pWork->type == DRAW) -+ { -+ InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc); -+ } -+ } -+ -+ while ((pWork = tile.peek()) != nullptr) -+ { -+ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); -+ tile.dequeue(); -+ } -+ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); -+ -+ _ReadWriteBarrier(); -+ -+ pDC->pTileMgr->markTileComplete(tileID); -+ -+ // Optimization: If the draw is complete and we're the last one to have worked on it then -+ // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. -+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) -+ { -+ // We can increment the current BE and safely move to next draw since we know this draw is complete. -+ curDrawBE++; -+ lastRetiredDraw++; -+ -+ lockedTiles.clear(); -+ break; -+ } -+ } -+ else -+ { -+ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-+ lockedTiles.insert(tileID); -+ } -+ } -+ } -+ } -+ } -+} -+ -+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawFE, UCHAR numaNode) -+{ -+ // Try to grab the next DC from the ring -+ uint64_t drawEnqueued = GetEnqueuedDraw(pContext); -+ while (curDrawFE < drawEnqueued) -+ { -+ uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT; -+ DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; -+ if (pDC->isCompute || pDC->doneFE || pDC->FeLock) -+ { -+ curDrawFE++; -+ } -+ else -+ { -+ break; -+ } -+ } -+ -+ uint64_t curDraw = curDrawFE; -+ while (curDraw < drawEnqueued) -+ { -+ uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; -+ DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; -+ -+ if (!pDC->isCompute && !pDC->FeLock) -+ { -+ uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0); -+ if (initial == 0) -+ { -+ // successfully grabbed the DC, now run the FE -+ pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc); -+ } -+ } -+ curDraw++; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief If there is any compute work then go work on it. -+/// @param pContext - pointer to SWR context. -+/// @param workerId - The unique worker ID that is assigned to this thread. -+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread -+/// has its own curDrawBE counter and this ensures that each worker processes all the -+/// draws in order. -+void WorkOnCompute( -+ SWR_CONTEXT *pContext, -+ uint32_t workerId, -+ volatile uint64_t& curDrawBE) -+{ -+ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) -+ { -+ return; -+ } -+ -+ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; -+ -+ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; -+ if (pDC->isCompute == false) return; -+ -+ // check dependencies -+ if (CheckDependency(pContext, pDC, lastRetiredDraw)) -+ { -+ return; -+ } -+ -+ SWR_ASSERT(pDC->pDispatch != nullptr); -+ DispatchQueue& queue = *pDC->pDispatch; -+ -+ // Is there any work remaining? -+ if (queue.getNumQueued() > 0) -+ { -+ bool lastToComplete = false; -+ -+ uint32_t threadGroupId = 0; -+ while (queue.getWork(threadGroupId)) -+ { -+ ProcessComputeBE(pDC, workerId, threadGroupId); -+ -+ lastToComplete = queue.finishedWork(); -+ } -+ -+ _ReadWriteBarrier(); -+ -+ if (lastToComplete) -+ { -+ SWR_ASSERT(queue.isWorkComplete() == true); -+ pDC->doneCompute = true; -+ } -+ } -+} -+ -+DWORD workerThread(LPVOID pData) -+{ -+ THREAD_DATA *pThreadData = (THREAD_DATA*)pData; -+ SWR_CONTEXT *pContext = pThreadData->pContext; -+ uint32_t threadId = pThreadData->threadId; -+ uint32_t workerId = pThreadData->workerId; -+ -+ bindThread(threadId, pThreadData->procGroupId); -+ -+ RDTSC_INIT(threadId); -+ -+ int numaNode = (int)pThreadData->numaId; -+ -+ // flush denormals to 0 -+ _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); -+ -+ // Track tiles locked by other threads. If we try to lock a macrotile and find its already -+ // locked then we'll add it to this list so that we don't try and lock it again. -+ std::unordered_set lockedTiles; -+ -+ // each worker has the ability to work on any of the queued draws as long as certain -+ // conditions are met. the data associated -+ // with a draw is guaranteed to be active as long as a worker hasn't signaled that he -+ // has moved on to the next draw when he determines there is no more work to do. 
The api -+ // thread will not increment the head of the dc ring until all workers have moved past the -+ // current head. -+ // the logic to determine what to work on is: -+ // 1- try to work on the FE any draw that is queued. For now there are no dependencies -+ // on the FE work, so any worker can grab any FE and process in parallel. Eventually -+ // we'll need dependency tracking to force serialization on FEs. The worker will try -+ // to pick an FE by atomically incrementing a counter in the swr context. he'll keep -+ // trying until he reaches the tail. -+ // 2- BE work must be done in strict order. we accomplish this today by pulling work off -+ // the oldest draw (ie the head) of the dcRing. the worker can determine if there is -+ // any work left by comparing the total # of binned work items and the total # of completed -+ // work items. If they are equal, then there is no more work to do for this draw, and -+ // the worker can safely increment its oldestDraw counter and move on to the next draw. -+ std::unique_lock lock(pContext->WaitLock, std::defer_lock); -+ while (pContext->threadPool.inThreadShutdown == false) -+ { -+ uint32_t loop = 0; -+ while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && pContext->WorkerBE[workerId] == pContext->DrawEnqueued) -+ { -+ _mm_pause(); -+ } -+ -+ if (pContext->WorkerBE[workerId] == pContext->DrawEnqueued) -+ { -+ lock.lock(); -+ -+ // check for thread idle condition again under lock -+ if (pContext->WorkerBE[workerId] != pContext->DrawEnqueued) -+ { -+ lock.unlock(); -+ continue; -+ } -+ -+ if (pContext->threadPool.inThreadShutdown) -+ { -+ lock.unlock(); -+ break; -+ } -+ -+ RDTSC_START(WorkerWaitForThreadEvent); -+ -+ pContext->FifosNotEmpty.wait(lock); -+ lock.unlock(); -+ -+ RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0); -+ -+ if (pContext->threadPool.inThreadShutdown) -+ { -+ break; -+ } -+ } -+ -+ RDTSC_START(WorkerWorkOnFifoBE); -+ WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles); -+ RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); -+ -+ WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]); -+ -+ WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode); -+ } -+ -+ return 0; -+} -+ -+void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) -+{ -+ // Bind application thread to HW thread 0 -+ bindThread(0); -+ -+ CPUNumaNodes nodes; -+ CalculateProcessorTopology(nodes); -+ -+ uint32_t numHWNodes = (uint32_t)nodes.size(); -+ uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); -+ uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); -+ -+ uint32_t numNodes = numHWNodes; -+ uint32_t numCoresPerNode = numHWCoresPerNode; -+ uint32_t numHyperThreads = numHWHyperThreads; -+ -+ if (KNOB_MAX_NUMA_NODES) -+ { -+ numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); -+ } -+ -+ if (KNOB_MAX_CORES_PER_NUMA_NODE) -+ { -+ numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE); -+ } -+ -+ if (KNOB_MAX_THREADS_PER_CORE) -+ { -+ numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); -+ } -+ -+ // Calculate numThreads -+ uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; -+ -+ if (numThreads > KNOB_MAX_NUM_THREADS) -+ { -+ printf("WARNING: system thread count %u exceeds max %u, " -+ "performance will be degraded\n", -+ numThreads, KNOB_MAX_NUM_THREADS); -+ } -+ -+ if (numThreads == 1) -+ { -+ // If only 1 worker thread, try to move it to an available -+ // HW thread. If that fails, use the API thread. 
-+ if (numCoresPerNode < numHWCoresPerNode) -+ { -+ numCoresPerNode++; -+ } -+ else if (numHyperThreads < numHWHyperThreads) -+ { -+ numHyperThreads++; -+ } -+ else if (numNodes < numHWNodes) -+ { -+ numNodes++; -+ } -+ else -+ { -+ pPool->numThreads = 0; -+ SET_KNOB(SINGLE_THREADED, true); -+ return; -+ } -+ } -+ else -+ { -+ // Save a HW thread for the API thread. -+ numThreads--; -+ } -+ -+ pPool->numThreads = numThreads; -+ pContext->NumWorkerThreads = pPool->numThreads; -+ -+ pPool->inThreadShutdown = false; -+ pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); -+ -+ uint32_t workerId = 0; -+ for (uint32_t n = 0; n < numNodes; ++n) -+ { -+ auto& node = nodes[n]; -+ -+ uint32_t numCores = numCoresPerNode; -+ for (uint32_t c = 0; c < numCores; ++c) -+ { -+ auto& core = node.cores[c]; -+ for (uint32_t t = 0; t < numHyperThreads; ++t) -+ { -+ if (c == 0 && n == 0 && t == 0) -+ { -+ // Skip core 0, thread0 on node 0 to reserve for API thread -+ continue; -+ } -+ -+ pPool->pThreadData[workerId].workerId = workerId; -+ pPool->pThreadData[workerId].procGroupId = core.procGroup; -+ pPool->pThreadData[workerId].threadId = core.threadIds[t]; -+ pPool->pThreadData[workerId].numaId = n; -+ pPool->pThreadData[workerId].pContext = pContext; -+ pPool->threads[workerId] = new std::thread(workerThread, &pPool->pThreadData[workerId]); -+ -+ ++workerId; -+ } -+ } -+ } -+} -+ -+void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) -+{ -+ if (!KNOB_SINGLE_THREADED) -+ { -+ // Inform threads to finish up -+ std::unique_lock lock(pContext->WaitLock); -+ pPool->inThreadShutdown = true; -+ _mm_mfence(); -+ pContext->FifosNotEmpty.notify_all(); -+ lock.unlock(); -+ -+ // Wait for threads to finish and destroy them -+ for (uint32_t t = 0; t < pPool->numThreads; ++t) -+ { -+ pPool->threads[t]->join(); -+ delete(pPool->threads[t]); -+ } -+ -+ // Clean up data used by threads -+ free(pPool->pThreadData); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h -new file mode 100644 -index 0000000..0c91bf8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h -@@ -0,0 +1,62 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file threads.h -+* -+* @brief Definitions for SWR threading model. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "knobs.h" -+ -+#include -+#include -+typedef std::thread* THREAD_PTR; -+ -+struct SWR_CONTEXT; -+ -+struct THREAD_DATA -+{ -+ uint32_t procGroupId; // Will always be 0 for non-Windows OS -+ uint32_t threadId; // within the procGroup for Windows -+ uint32_t numaId; // NUMA node id -+ uint32_t workerId; -+ SWR_CONTEXT *pContext; -+}; -+ -+ -+struct THREAD_POOL -+{ -+ THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; -+ uint32_t numThreads; -+ volatile bool inThreadShutdown; -+ THREAD_DATA *pThreadData; -+}; -+ -+void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); -+void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); -+ -+// Expose FE and BE worker functions to the API thread if single threaded -+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawFE, UCHAR numaNode); -+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawBE, std::unordered_set &usedTiles); -+void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawBE); -diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp -new file mode 100644 -index 0000000..24b4b60 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp -@@ -0,0 +1,105 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tilemgr.cpp -+* -+* @brief Implementation for Macro Tile Manager which provides the facilities -+* for threads to work on an macro tile. 
-+* -+******************************************************************************/ -+#include -+ -+#include "fifo.hpp" -+#include "tilemgr.h" -+ -+#define TILE_ID(x,y) ((x << 16 | y)) -+ -+// override new/delete for alignment -+void *MacroTileMgr::operator new(size_t size) -+{ -+ return _aligned_malloc(size, 64); -+} -+ -+void MacroTileMgr::operator delete(void *p) -+{ -+ _aligned_free(p); -+} -+ -+void* DispatchQueue::operator new(size_t size) -+{ -+ return _aligned_malloc(size, 64); -+} -+ -+void DispatchQueue::operator delete(void *p) -+{ -+ _aligned_free(p); -+} -+ -+MacroTileMgr::MacroTileMgr() -+{ -+} -+ -+void MacroTileMgr::initialize() -+{ -+ mWorkItemsProduced = 0; -+ mWorkItemsConsumed = 0; -+ -+ mDirtyTiles.clear(); -+} -+ -+void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) -+{ -+ // Should not enqueue more then what we have backing for in the hot tile manager. -+ SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); -+ SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); -+ -+ uint32_t id = TILE_ID(x, y); -+ -+ MacroTileQueue &tile = mTiles[id]; -+ tile.mWorkItemsFE++; -+ -+ if (tile.mWorkItemsFE == 1) -+ { -+ tile.clear(); -+ mDirtyTiles.push_back(id); -+ } -+ -+ mWorkItemsProduced++; -+ tile.enqueue_try_nosync(pWork); -+} -+ -+void MacroTileMgr::markTileComplete(uint32_t id) -+{ -+ SWR_ASSERT(mTiles.find(id) != mTiles.end()); -+ MacroTileQueue &tile = mTiles[id]; -+ uint32_t numTiles = tile.mWorkItemsFE; -+ InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); -+ -+ _ReadWriteBarrier(); -+ tile.mWorkItemsBE += numTiles; -+ SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE); -+ -+ // clear out tile, but defer fifo clear until the next DC first queues to it. -+ // this prevents worker threads from constantly locking a completed macro tile -+ tile.mWorkItemsFE = 0; -+ tile.mWorkItemsBE = 0; -+} -diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h -new file mode 100644 -index 0000000..b537730 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h -@@ -0,0 +1,392 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tilemgr.h -+* -+* @brief Definitions for Macro Tile Manager which provides the facilities -+* for threads to work on an macro tile. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include -+#include "common/formats.h" -+#include "fifo.hpp" -+#include "context.h" -+#include "format_traits.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// MacroTile - work queue for a tile. -+////////////////////////////////////////////////////////////////////////// -+struct MacroTileQueue -+{ -+ MacroTileQueue() -+ { -+ mFifo.initialize(); -+ } -+ -+ ~MacroTileQueue() { } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Returns number of work items queued for this tile. -+ uint32_t getNumQueued() -+ { -+ return mFifo.getNumQueued(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Attempt to lock the work fifo. If already locked then return false. -+ bool tryLock() -+ { -+ return mFifo.tryLock(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Clear fifo and unlock it. -+ void clear() -+ { -+ mFifo.clear(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Peek at work sitting at the front of the fifo. -+ BE_WORK* peek() -+ { -+ return mFifo.peek(); -+ } -+ -+ bool enqueue_try_nosync(const BE_WORK* entry) -+ { -+ return mFifo.enqueue_try_nosync(entry); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Move to next work item -+ void dequeue() -+ { -+ mFifo.dequeue_noinc(); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Destroy fifo -+ void destroy() -+ { -+ mFifo.destroy(); -+ } -+ -+ ///@todo This will all be private. -+ uint32_t mWorkItemsFE = 0; -+ uint32_t mWorkItemsBE = 0; -+ -+private: -+ QUEUE mFifo; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// MacroTileMgr - Manages macrotiles for a draw. -+////////////////////////////////////////////////////////////////////////// -+class MacroTileMgr -+{ -+public: -+ MacroTileMgr(); -+ ~MacroTileMgr() -+ { -+ for (auto &tile : mTiles) -+ { -+ tile.second.destroy(); -+ } -+ } -+ -+ void initialize(); -+ INLINE std::vector& getDirtyTiles() { return mDirtyTiles; } -+ INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } -+ void markTileComplete(uint32_t id); -+ -+ INLINE bool isWorkComplete() -+ { -+ return mWorkItemsProduced == mWorkItemsConsumed; -+ } -+ -+ void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork); -+ -+ static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y) -+ { -+ y = tileID & 0xffff; -+ x = (tileID >> 16) & 0xffff; -+ } -+ -+ void *operator new(size_t size); -+ void operator delete (void *p); -+ -+private: -+ SWR_FORMAT mFormat; -+ std::unordered_map mTiles; -+ -+ // Any tile that has work queued to it is a dirty tile. -+ std::vector mDirtyTiles; -+ -+ OSALIGNLINE(LONG) mWorkItemsProduced; -+ OSALIGNLINE(volatile LONG) mWorkItemsConsumed; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// DispatchQueue - work queue for dispatch -+////////////////////////////////////////////////////////////////////////// -+class DispatchQueue -+{ -+public: -+ DispatchQueue() {} -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Setup the producer consumer counts. 
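-+ /// @param totalTasks - total number of thread groups in this dispatch; both counters start at this value.
-+ /// @param pTaskData - opaque per-dispatch data that the dispatch callback will interpret.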
-+ void initialize(uint32_t totalTasks, void* pTaskData) -+ { -+ // The available and outstanding counts start with total tasks. -+ // At the start there are N tasks available and outstanding. -+ // When both the available and outstanding counts have reached 0 then all work has completed. -+ // When a worker starts on a threadgroup then it decrements the available count. -+ // When a worker completes a threadgroup then it decrements the outstanding count. -+ -+ mTasksAvailable = totalTasks; -+ mTasksOutstanding = totalTasks; -+ -+ mpTaskData = pTaskData; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Returns number of tasks available for this dispatch. -+ uint32_t getNumQueued() -+ { -+ return (mTasksAvailable > 0) ? mTasksAvailable : 0; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Atomically decrement the work available count. If the result -+ // is greater than 0 then we can work on the associated thread group. -+ // Otherwise, there is no more work to do. -+ bool getWork(uint32_t& groupId) -+ { -+ LONG result = InterlockedDecrement(&mTasksAvailable); -+ -+ if (result >= 0) -+ { -+ groupId = result; -+ return true; -+ } -+ -+ return false; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Atomically decrement the outstanding count. A worker is notifying -+ /// us that he just finished some work. Also, return true if we're -+ /// the last worker to complete this dispatch. -+ bool finishedWork() -+ { -+ LONG result = InterlockedDecrement(&mTasksOutstanding); -+ SWR_ASSERT(result >= 0, "Should never oversubscribe work"); -+ -+ return (result == 0) ? true : false; -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Work is complete once both the available/outstanding counts have reached 0. -+ bool isWorkComplete() -+ { -+ return ((mTasksAvailable <= 0) && -+ (mTasksOutstanding <= 0)); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Return pointer to task data. -+ const void* GetTasksData() -+ { -+ return mpTaskData; -+ } -+ -+ void *operator new(size_t size); -+ void operator delete (void *p); -+ -+ void* mpTaskData; // The API thread will set this up and the callback task function will interpret this. -+ -+ OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; -+ OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 }; -+}; -+ -+ -+enum HOTTILE_STATE -+{ -+ HOTTILE_INVALID, // tile is in uninitialized state and should be loaded with surface contents before rendering -+ HOTTILE_CLEAR, // tile should be cleared -+ HOTTILE_DIRTY, // tile has been rendered to -+ HOTTILE_RESOLVED, // tile has been stored to memory -+}; -+ -+struct HOTTILE -+{ -+ BYTE *pBuffer; -+ HOTTILE_STATE state; -+ DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
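-+ // number of samples this hot tile buffer was allocated for; reallocated if a later draw needs more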
-+ uint32_t numSamples; -+ uint32_t renderTargetArrayIndex; // current render target array index loaded -+}; -+ -+union HotTileSet -+{ -+ struct -+ { -+ HOTTILE Color[SWR_NUM_RENDERTARGETS]; -+ HOTTILE Depth; -+ HOTTILE Stencil; -+ }; -+ HOTTILE Attachment[SWR_NUM_ATTACHMENTS]; -+}; -+ -+class HotTileMgr -+{ -+public: -+ HotTileMgr() -+ { -+ memset(&mHotTiles[0][0], 0, sizeof(mHotTiles)); -+ -+ // cache hottile size -+ for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) -+ { -+ mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits::bpp / 8; -+ } -+ mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits::bpp / 8; -+ mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits::bpp / 8; -+ } -+ -+ ~HotTileMgr() -+ { -+ for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x) -+ { -+ for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y) -+ { -+ for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) -+ { -+ if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) -+ { -+ _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); -+ mHotTiles[x][y].Attachment[a].pBuffer = NULL; -+ } -+ } -+ } -+ } -+ } -+ -+ HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, -+ uint32_t renderTargetArrayIndex = 0) -+ { -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroID, x, y); -+ -+ assert(x < KNOB_NUM_HOT_TILES_X); -+ assert(y < KNOB_NUM_HOT_TILES_Y); -+ -+ HotTileSet &tile = mHotTiles[x][y]; -+ HOTTILE& hotTile = tile.Attachment[attachment]; -+ if (hotTile.pBuffer == NULL) -+ { -+ if (create) -+ { -+ uint32_t size = numSamples * mHotTileSize[attachment]; -+ hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); -+ hotTile.state = HOTTILE_INVALID; -+ hotTile.numSamples = numSamples; -+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex; -+ } -+ else -+ { -+ return NULL; -+ } -+ } -+ else -+ { -+ // free the old tile and create a new one with enough space to hold all samples -+ if (numSamples > hotTile.numSamples) -+ { -+ // tile should be either uninitialized or resolved if we're deleting and switching to a -+ // new sample count -+ assert((hotTile.state == HOTTILE_INVALID) || -+ (hotTile.state == HOTTILE_RESOLVED)); -+ _aligned_free(hotTile.pBuffer); -+ -+ uint32_t size = numSamples * mHotTileSize[attachment]; -+ hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); -+ hotTile.state = HOTTILE_INVALID; -+ hotTile.numSamples = numSamples; -+ } -+ -+ // if requested render target array index isn't currently loaded, need to store out the current hottile -+ // and load the requested array slice -+ if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) -+ { -+ SWR_FORMAT format; -+ switch (attachment) -+ { -+ case SWR_ATTACHMENT_COLOR0: -+ case SWR_ATTACHMENT_COLOR1: -+ case SWR_ATTACHMENT_COLOR2: -+ case SWR_ATTACHMENT_COLOR3: -+ case SWR_ATTACHMENT_COLOR4: -+ case SWR_ATTACHMENT_COLOR5: -+ case SWR_ATTACHMENT_COLOR6: -+ case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; -+ case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; -+ default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; -+ } -+ -+ if (hotTile.state == HOTTILE_DIRTY) -+ { -+ pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, -+ x * 
KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); -+ } -+ -+ pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, -+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); -+ -+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex; -+ hotTile.state = HOTTILE_DIRTY; -+ } -+ } -+ return &tile.Attachment[attachment]; -+ } -+ -+ HotTileSet &GetHotTile(uint32_t macroID) -+ { -+ uint32_t x, y; -+ MacroTileMgr::getTileIndices(macroID, x, y); -+ assert(x < KNOB_NUM_HOT_TILES_X); -+ assert(y < KNOB_NUM_HOT_TILES_Y); -+ -+ return mHotTiles[x][y]; -+ } -+ -+private: -+ HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; -+ uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; -+}; -+ -diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp -new file mode 100644 -index 0000000..f36452f ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp -@@ -0,0 +1,148 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file utils.cpp -+* -+* @brief Utilities used by SWR core. -+* -+******************************************************************************/ -+#if defined(_WIN32) -+ -+#include -+#include -+#include -+#include -+ -+using namespace Gdiplus; -+ -+int GetEncoderClsid(const WCHAR* format, CLSID* pClsid) -+{ -+ uint32_t num = 0; // number of image encoders -+ uint32_t size = 0; // size of the image encoder array in bytes -+ -+ ImageCodecInfo* pImageCodecInfo = nullptr; -+ -+ GetImageEncodersSize(&num, &size); -+ if(size == 0) -+ return -1; // Failure -+ -+ pImageCodecInfo = (ImageCodecInfo*)(malloc(size)); -+ if(pImageCodecInfo == nullptr) -+ return -1; // Failure -+ -+ GetImageEncoders(num, size, pImageCodecInfo); -+ -+ for(uint32_t j = 0; j < num; ++j) -+ { -+ if( wcscmp(pImageCodecInfo[j].MimeType, format) == 0 ) -+ { -+ *pClsid = pImageCodecInfo[j].Clsid; -+ free(pImageCodecInfo); -+ return j; // Success -+ } -+ } -+ -+ free(pImageCodecInfo); -+ return -1; // Failure -+} -+ -+void SaveImageToPNGFile( -+ const WCHAR *pFilename, -+ void *pBuffer, -+ uint32_t width, -+ uint32_t height) -+{ -+ // dump pixels to a png -+ // Initialize GDI+. 
-+ GdiplusStartupInput gdiplusStartupInput; -+ ULONG_PTR gdiplusToken; -+ GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); -+ -+ Bitmap *bitmap = new Bitmap(width, height); -+ BYTE *pBytes = (BYTE*)pBuffer; -+ static const uint32_t bytesPerPixel = 4; -+ for (uint32_t y = 0; y < height; ++y) -+ for (uint32_t x = 0; x < width; ++x) -+ { -+ uint32_t pixel = *(uint32_t*)pBytes; -+ if (pixel == 0xcdcdcdcd) -+ { -+ pixel = 0xFFFF00FF; -+ } -+ else if (pixel == 0xdddddddd) -+ { -+ pixel = 0x80FF0000; -+ } -+ else -+ { -+ pixel |= 0xFF000000; -+ } -+ Color color(pixel); -+ bitmap->SetPixel(x, y, color); -+ pBytes += bytesPerPixel; -+ } -+ -+ // Save image. -+ CLSID pngClsid; -+ GetEncoderClsid(L"image/png", &pngClsid); -+ bitmap->Save(pFilename, &pngClsid, nullptr); -+ -+ delete bitmap; -+ -+ GdiplusShutdown(gdiplusToken); -+} -+ -+void OpenBitmapFromFile( -+ const WCHAR *pFilename, -+ void **pBuffer, -+ uint32_t *width, -+ uint32_t *height) -+{ -+ GdiplusStartupInput gdiplusStartupInput; -+ ULONG_PTR gdiplusToken; -+ GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); -+ -+ Bitmap *bitmap = new Bitmap(pFilename); -+ -+ *width = bitmap->GetWidth(); -+ *height = bitmap->GetHeight(); -+ *pBuffer = new BYTE[*width * *height * 4]; // width * height * |RGBA| -+ -+ // The folder 'stb_image' contains a PNG open/close module which -+ // is far less painful than this is, yo. -+ Gdiplus::Color clr; -+ for (uint32_t y = 0, idx = 0; y < *height; ++y) -+ { -+ for (uint32_t x = 0; x < *width; ++x, idx += 4) -+ { -+ bitmap->GetPixel(x, *height - y - 1, &clr); -+ ((BYTE*)*pBuffer)[idx + 0] = clr.GetBlue(); -+ ((BYTE*)*pBuffer)[idx + 1] = clr.GetGreen(); -+ ((BYTE*)*pBuffer)[idx + 2] = clr.GetRed(); -+ ((BYTE*)*pBuffer)[idx + 3] = clr.GetAlpha(); -+ } -+ } -+ -+ delete bitmap; -+ bitmap = 0; -+} -+#endif -diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h -new file mode 100644 -index 0000000..63d6ca1 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h -@@ -0,0 +1,745 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file utils.h -+* -+* @brief Utilities used by SWR core. 
-+* -+******************************************************************************/ -+#pragma once -+ -+#include -+#include "common/os.h" -+#include "common/simdintrin.h" -+#include "common/swr_assert.h" -+ -+#if defined(_WIN32) -+void SaveImageToPNGFile( -+ const WCHAR *pFilename, -+ void *pBuffer, -+ uint32_t width, -+ uint32_t height); -+ -+void OpenBitmapFromFile( -+ const WCHAR *pFilename, -+ void **pBuffer, -+ uint32_t *width, -+ uint32_t *height); -+#endif -+ -+/// @todo assume linux is always 64 bit -+#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) -+#define _MM_INSERT_EPI64 _mm_insert_epi64 -+#define _MM_EXTRACT_EPI64 _mm_extract_epi64 -+#else -+INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) -+{ -+ OSALIGNLINE(uint32_t) elems[4]; -+ _mm_store_si128((__m128i*)elems, a); -+ if (ndx == 0) -+ { -+ uint64_t foo = elems[0]; -+ foo |= (uint64_t)elems[1] << 32; -+ return foo; -+ } -+ else -+ { -+ uint64_t foo = elems[2]; -+ foo |= (uint64_t)elems[3] << 32; -+ return foo; -+ } -+} -+ -+INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) -+{ -+ OSALIGNLINE(int64_t) elems[2]; -+ _mm_store_si128((__m128i*)elems, a); -+ if (ndx == 0) -+ { -+ elems[0] = b; -+ } -+ else -+ { -+ elems[1] = b; -+ } -+ __m128i out; -+ out = _mm_load_si128((const __m128i*)elems); -+ return out; -+} -+#endif -+ -+OSALIGNLINE(struct) BBOX -+{ -+ int top, bottom, left, right; -+ -+ BBOX() {} -+ BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} -+ -+ bool operator==(const BBOX& rhs) -+ { -+ return (this->top == rhs.top && -+ this->bottom == rhs.bottom && -+ this->left == rhs.left && -+ this->right == rhs.right); -+ } -+ -+ bool operator!=(const BBOX& rhs) -+ { -+ return !(*this == rhs); -+ } -+}; -+ -+struct simdBBox -+{ -+ simdscalari top, bottom, left, right; -+}; -+ -+INLINE -+void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) -+{ -+ __m128i row0i = _mm_castps_si128(row0); -+ __m128i row1i = _mm_castps_si128(row1); -+ __m128i row2i = _mm_castps_si128(row2); -+ __m128i row3i = _mm_castps_si128(row3); -+ -+ __m128i vTemp = row2i; -+ row2i = _mm_unpacklo_epi32(row2i, row3i); -+ vTemp = _mm_unpackhi_epi32(vTemp, row3i); -+ -+ row3i = row0i; -+ row0i = _mm_unpacklo_epi32(row0i, row1i); -+ row3i = _mm_unpackhi_epi32(row3i, row1i); -+ -+ row1i = row0i; -+ row0i = _mm_unpacklo_epi64(row0i, row2i); -+ row1i = _mm_unpackhi_epi64(row1i, row2i); -+ -+ row2i = row3i; -+ row2i = _mm_unpacklo_epi64(row2i, vTemp); -+ row3i = _mm_unpackhi_epi64(row3i, vTemp); -+ -+ row0 = _mm_castsi128_ps(row0i); -+ row1 = _mm_castsi128_ps(row1i); -+ row2 = _mm_castsi128_ps(row2i); -+ row3 = _mm_castsi128_ps(row3i); -+} -+ -+INLINE -+void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) -+{ -+ __m128i vTemp = row2; -+ row2 = _mm_unpacklo_epi32(row2, row3); -+ vTemp = _mm_unpackhi_epi32(vTemp, row3); -+ -+ row3 = row0; -+ row0 = _mm_unpacklo_epi32(row0, row1); -+ row3 = _mm_unpackhi_epi32(row3, row1); -+ -+ row1 = row0; -+ row0 = _mm_unpacklo_epi64(row0, row2); -+ row1 = _mm_unpackhi_epi64(row1, row2); -+ -+ row2 = row3; -+ row2 = _mm_unpacklo_epi64(row2, vTemp); -+ row3 = _mm_unpackhi_epi64(row3, vTemp); -+} -+ -+#define GCC_VERSION (__GNUC__ * 10000 \ -+ + __GNUC_MINOR__ * 100 \ -+ + __GNUC_PATCHLEVEL__) -+ -+#if defined(__GNUC__) && (GCC_VERSION < 40900) -+#define _mm_undefined_ps _mm_setzero_ps -+#define _mm_undefined_si128 _mm_setzero_si128 -+#if KNOB_SIMD_WIDTH == 8 -+#define _mm256_undefined_ps _mm256_setzero_ps -+#endif 
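-+// end of fallbacks that map the _mm*_undefined_* intrinsics to setzero on GCC versions older than 4.9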
-+#endif -+ -+#if KNOB_SIMD_WIDTH == 8 -+INLINE -+void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) -+{ -+ __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 -+ __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 -+ __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 -+ __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 -+ -+ r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 -+ r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 -+ __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 -+ __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 -+ -+ vDst[0] = _mm256_castps256_ps128(r02r1xlolo); -+ vDst[1] = _mm256_castps256_ps128(r02r1xlohi); -+ vDst[2] = _mm256_castps256_ps128(r02r1xhilo); -+ vDst[3] = _mm256_castps256_ps128(r02r1xhihi); -+ -+ vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); -+ vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); -+ vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); -+ vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); -+} -+ -+INLINE -+void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) -+{ -+ __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 -+ __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 -+ __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 -+ __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 -+ -+ r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 -+ r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 -+ __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 -+ __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 -+ -+ vDst[0] = _mm256_castps256_ps128(r02r1xlolo); -+ vDst[1] = _mm256_castps256_ps128(r02r1xlohi); -+ vDst[2] = _mm256_castps256_ps128(r02r1xhilo); -+ vDst[3] = _mm256_castps256_ps128(r02r1xhihi); -+ -+ vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); -+ vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); -+ vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); -+ vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); -+} -+ -+INLINE -+void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) -+{ -+ __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); -+ __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); -+ __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); -+ __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); -+ __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); -+ __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); -+ __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); -+ __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); -+ __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); -+ __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); -+ __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); -+ __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); -+ __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); -+ vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); -+ vDst[1] = 
_mm256_permute2f128_ps(__tt1, __tt5, 0x20); -+ vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); -+ vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); -+ vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); -+ vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); -+ vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); -+ vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); -+} -+ -+INLINE -+void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) -+{ -+ vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), -+ _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// TranposeSingleComponent -+////////////////////////////////////////////////////////////////////////// -+template -+struct TransposeSingleComponent -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Pass-thru for single component. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose8_8_8_8 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose8_8_8_8 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 
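-+ /// The AVX path interleaves the channel halves with 128-bit unpacks; the AVX2 path uses a byte shuffle plus a lane permute.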
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ simdscalari src = _simd_load_si((const simdscalari*)pSrc); -+#if KNOB_SIMD_WIDTH == 8 -+#if KNOB_ARCH == KNOB_ARCH_AVX -+ __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg -+ __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa -+ __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb -+ __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa -+ __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg -+ __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa -+ __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba -+ __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba -+ _mm_store_si128((__m128i*)pDst, c0123lo); -+ _mm_store_si128((__m128i*)(pDst + 16), c0123hi); -+#elif KNOB_ARCH == KNOB_ARCH_AVX2 -+ simdscalari dst01 = _mm256_shuffle_epi8(src, -+ _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); -+ simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); -+ dst23 = _mm256_shuffle_epi8(dst23, -+ _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); -+ simdscalari dst = _mm256_or_si256(dst01, dst23); -+ _simd_store_si((simdscalari*)pDst, dst); -+#endif -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose8_8_8 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose8_8_8 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose8_8 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose8_8 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 8_8 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ simdscalari src = _simd_load_si((const simdscalari*)pSrc); -+ -+#if KNOB_SIMD_WIDTH == 8 -+ __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg -+ __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg -+ rg = _mm_unpacklo_epi8(rg, g); -+ _mm_store_si128((__m128i*)pDst, rg); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose32_32_32_32 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose32_32_32_32 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 
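-+ /// Loads one simd register per component and interleaves them with vTranspose4x8 before storing the AOS result.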
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalar src0 = _simd_load_ps((const float*)pSrc); -+ simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); -+ simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); -+ simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); -+ -+ __m128 vDst[8]; -+ vTranspose4x8(vDst, src0, src1, src2, src3); -+ _mm_store_ps((float*)pDst, vDst[0]); -+ _mm_store_ps((float*)pDst+4, vDst[1]); -+ _mm_store_ps((float*)pDst+8, vDst[2]); -+ _mm_store_ps((float*)pDst+12, vDst[3]); -+ _mm_store_ps((float*)pDst+16, vDst[4]); -+ _mm_store_ps((float*)pDst+20, vDst[5]); -+ _mm_store_ps((float*)pDst+24, vDst[6]); -+ _mm_store_ps((float*)pDst+28, vDst[7]); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose32_32_32 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose32_32_32 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalar src0 = _simd_load_ps((const float*)pSrc); -+ simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); -+ simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); -+ -+ __m128 vDst[8]; -+ vTranspose3x8(vDst, src0, src1, src2); -+ _mm_store_ps((float*)pDst, vDst[0]); -+ _mm_store_ps((float*)pDst + 4, vDst[1]); -+ _mm_store_ps((float*)pDst + 8, vDst[2]); -+ _mm_store_ps((float*)pDst + 12, vDst[3]); -+ _mm_store_ps((float*)pDst + 16, vDst[4]); -+ _mm_store_ps((float*)pDst + 20, vDst[5]); -+ _mm_store_ps((float*)pDst + 24, vDst[6]); -+ _mm_store_ps((float*)pDst + 28, vDst[7]); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose32_32 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose32_32 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 
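-+ /// Interleaves the red and green component vectors using 128-bit unpack instructions.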
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ const float* pfSrc = (const float*)pSrc; -+ __m128 src_r0 = _mm_load_ps(pfSrc + 0); -+ __m128 src_r1 = _mm_load_ps(pfSrc + 4); -+ __m128 src_g0 = _mm_load_ps(pfSrc + 8); -+ __m128 src_g1 = _mm_load_ps(pfSrc + 12); -+ -+ __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); -+ __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); -+ __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); -+ __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); -+ -+ float* pfDst = (float*)pDst; -+ _mm_store_ps(pfDst + 0, dst0); -+ _mm_store_ps(pfDst + 4, dst1); -+ _mm_store_ps(pfDst + 8, dst2); -+ _mm_store_ps(pfDst + 12, dst3); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose16_16_16_16 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose16_16_16_16 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); -+ simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); -+ -+ __m128i src_r = _mm256_extractf128_si256(src_rg, 0); -+ __m128i src_g = _mm256_extractf128_si256(src_rg, 1); -+ __m128i src_b = _mm256_extractf128_si256(src_ba, 0); -+ __m128i src_a = _mm256_extractf128_si256(src_ba, 1); -+ -+ __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); -+ __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); -+ __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); -+ __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); -+ -+ __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); -+ __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); -+ __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); -+ __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); -+ -+ _mm_store_si128(((__m128i*)pDst) + 0, dst0); -+ _mm_store_si128(((__m128i*)pDst) + 1, dst1); -+ _mm_store_si128(((__m128i*)pDst) + 2, dst2); -+ _mm_store_si128(((__m128i*)pDst) + 3, dst3); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose16_16_16 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose16_16_16 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 
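-+ /// Splits each simd register into its 128-bit component planes and interleaves them with 16-bit and then 32-bit unpacks.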
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+#if KNOB_SIMD_WIDTH == 8 -+ simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); -+ -+ __m128i src_r = _mm256_extractf128_si256(src_rg, 0); -+ __m128i src_g = _mm256_extractf128_si256(src_rg, 1); -+ __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); -+ __m128i src_a = _mm_undefined_si128(); -+ -+ __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); -+ __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); -+ __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); -+ __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); -+ -+ __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); -+ __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); -+ __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); -+ __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); -+ -+ _mm_store_si128(((__m128i*)pDst) + 0, dst0); -+ _mm_store_si128(((__m128i*)pDst) + 1, dst1); -+ _mm_store_si128(((__m128i*)pDst) + 2, dst2); -+ _mm_store_si128(((__m128i*)pDst) + 3, dst3); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose16_16 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose16_16 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 16_16 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) -+ { -+ simdscalar src = _simd_load_ps((const float*)pSrc); -+ -+#if KNOB_SIMD_WIDTH == 8 -+ __m128 comp0 = _mm256_castps256_ps128(src); -+ __m128 comp1 = _mm256_extractf128_ps(src, 1); -+ -+ __m128i comp0i = _mm_castps_si128(comp0); -+ __m128i comp1i = _mm_castps_si128(comp1); -+ -+ __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); -+ __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); -+ -+ _mm_store_si128((__m128i*)pDst, resLo); -+ _mm_store_si128((__m128i*)pDst + 1, resHi); -+#else -+#error Unsupported vector width -+#endif -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose4_4_4_4 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose4_4_4_4 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose5_6_5 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose5_6_5 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 
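-+ /// Splits the single simd register into its two component halves and interleaves them with 16-bit unpacks.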
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose9_9_9_5 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose9_9_9_5 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose5_5_5_1 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose5_5_5_1 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose10_10_10_2 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose10_10_10_2 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. -+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Transpose11_11_10 -+////////////////////////////////////////////////////////////////////////// -+struct Transpose11_11_10 -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
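-+ /// Not implemented; the deleted member function below makes any attempted use a compile-time error.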
-+ /// @param pSrc - source data in SOA form -+ /// @param pDst - output data in AOS form -+ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; -+}; -+ -+// helper function to unroll loops -+template -+struct UnrollerL { -+ template -+ INLINE static void step(Lambda& func) { -+ func(Begin); -+ UnrollerL::step(func); -+ } -+}; -+ -+template -+struct UnrollerL { -+ template -+ static void step(Lambda& func) { -+ } -+}; -+ -+// general CRC compute -+INLINE -+uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) -+{ -+#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) -+ uint32_t sizeInQwords = size / sizeof(uint64_t); -+ uint32_t sizeRemainderBytes = size % sizeof(uint64_t); -+ uint64_t* pDataWords = (uint64_t*)pData; -+ for (uint32_t i = 0; i < sizeInQwords; ++i) -+ { -+ crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); -+ } -+#else -+ uint32_t sizeInDwords = size / sizeof(uint32_t); -+ uint32_t sizeRemainderBytes = size % sizeof(uint32_t); -+ uint32_t* pDataWords = (uint32_t*)pData; -+ for (uint32_t i = 0; i < sizeInDwords; ++i) -+ { -+ crc = _mm_crc32_u32(crc, *pDataWords++); -+ } -+#endif -+ -+ BYTE* pRemainderBytes = (BYTE*)pDataWords; -+ for (uint32_t i = 0; i < sizeRemainderBytes; ++i) -+ { -+ crc = _mm_crc32_u8(crc, *pRemainderBytes++); -+ } -+ -+ return crc; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Add byte offset to any-type pointer -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T* PtrAdd(T* p, intptr_t offset) -+{ -+ intptr_t intp = reinterpret_cast(p); -+ return reinterpret_cast(intp + offset); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Is a power-of-2? 
-+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static bool IsPow2(T value) -+{ -+ return value == (value & (0 - value)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align down to specified alignment -+/// Note: IsPow2(alignment) MUST be true -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignDownPow2(T1 value, T2 alignment) -+{ -+ SWR_ASSERT(IsPow2(alignment)); -+ return value & ~T1(alignment - 1); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align up to specified alignment -+/// Note: IsPow2(alignment) MUST be true -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignUpPow2(T1 value, T2 alignment) -+{ -+ return AlignDownPow2(value + T1(alignment - 1), alignment); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align down to specified alignment -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignDown(T1 value, T2 alignment) -+{ -+ if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } -+ return value - T1(value % alignment); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// Align up to specified alignment -+/// Note: IsPow2(alignment) MUST be true -+////////////////////////////////////////////////////////////////////////// -+template -+INLINE -+static T1 AlignUp(T1 value, T2 alignment) -+{ -+ return AlignDown(value + T1(alignment - 1), alignment); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp -new file mode 100644 -index 0000000..726b508 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp -@@ -0,0 +1,292 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file JitManager.cpp -+* -+* @brief Implementation if the Jit Manager. 
-+* -+* Notes: -+* -+******************************************************************************/ -+#if defined(_WIN32) -+#pragma warning(disable: 4800 4146 4244 4267 4355 4996) -+#endif -+ -+#include "jit_api.h" -+#include "JitManager.h" -+#include "fetch_jit.h" -+ -+#if defined(_WIN32) -+#include "llvm/ADT/Triple.h" -+#endif -+#include "llvm/IR/Function.h" -+#include "llvm/Support/DynamicLibrary.h" -+ -+#include "llvm/Support/MemoryBuffer.h" -+#include "llvm/Support/SourceMgr.h" -+#include "llvm/IRReader/IRReader.h" -+ -+#include "core/state.h" -+#include "common/containers.hpp" -+ -+#include "state_llvm.h" -+ -+#include -+#if defined(_WIN32) -+#include -+#include -+ -+#define INTEL_OUTPUT_DIR "c:\\Intel" -+#define RASTY_OUTPUT_DIR INTEL_OUTPUT_DIR "\\Rasty" -+#define JITTER_OUTPUT_DIR RASTY_OUTPUT_DIR "\\Jitter" -+#endif -+ -+using namespace llvm; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Contructor for JitManager. -+/// @param simdWidth - SIMD width to be used in generated program. -+JitManager::JitManager(uint32_t simdWidth, const char *arch) -+ : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch) -+{ -+ InitializeNativeTarget(); -+ InitializeNativeTargetAsmPrinter(); -+ InitializeNativeTargetDisassembler(); -+ -+ TargetOptions tOpts; -+ tOpts.AllowFPOpFusion = FPOpFusion::Fast; -+ tOpts.NoInfsFPMath = false; -+ tOpts.NoNaNsFPMath = false; -+ tOpts.UnsafeFPMath = true; -+#if defined(_DEBUG) -+ tOpts.NoFramePointerElim = true; -+#endif -+ -+ //tOpts.PrintMachineCode = true; -+ -+ std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << mJitNumber++; -+ std::unique_ptr newModule(new Module(fnName.str(), mContext)); -+ mpCurrentModule = newModule.get(); -+ -+ auto &&EB = EngineBuilder(std::move(newModule)); -+ EB.setTargetOptions(tOpts); -+ EB.setOptLevel(CodeGenOpt::Aggressive); -+ -+ StringRef hostCPUName; -+ -+ // force JIT to use the same CPU arch as the rest of rasty -+ if(mArch.AVX512F()) -+ { -+ assert(0 && "Implement AVX512 jitter"); -+ hostCPUName = sys::getHostCPUName(); -+ if (mVWidth == 0) -+ { -+ mVWidth = 16; -+ } -+ } -+ else if(mArch.AVX2()) -+ { -+ hostCPUName = StringRef("core-avx2"); -+ if (mVWidth == 0) -+ { -+ mVWidth = 8; -+ } -+ } -+ else if(mArch.AVX()) -+ { -+ if (mArch.F16C()) -+ { -+ hostCPUName = StringRef("core-avx-i"); -+ } -+ else -+ { -+ hostCPUName = StringRef("corei7-avx"); -+ } -+ if (mVWidth == 0) -+ { -+ mVWidth = 8; -+ } -+ } -+ else -+ { -+ hostCPUName = sys::getHostCPUName(); -+ if (mVWidth == 0) -+ { -+ mVWidth = 8; // 4? 
-+ } -+ } -+ -+ EB.setMCPU(hostCPUName); -+ -+#if defined(_WIN32) -+ // Needed for MCJIT on windows -+ Triple hostTriple(sys::getProcessTriple()); -+ hostTriple.setObjectFormat(Triple::ELF); -+ mpCurrentModule->setTargetTriple(hostTriple.getTriple()); -+#endif // _WIN32 -+ -+ mpExec = EB.create(); -+ -+#if LLVM_USE_INTEL_JITEVENTS -+ JITEventListener *vTune = JITEventListener::createIntelJITEventListener(); -+ mpExec->RegisterJITEventListener(vTune); -+#endif -+ -+ mFP32Ty = Type::getFloatTy(mContext); // float type -+ mInt8Ty = Type::getInt8Ty(mContext); -+ mInt32Ty = Type::getInt32Ty(mContext); // int type -+ mInt64Ty = Type::getInt64Ty(mContext); // int type -+ mV4FP32Ty = StructType::get(mContext, std::vector(4, mFP32Ty), false); // vector4 float type (represented as structure) -+ mV4Int32Ty = StructType::get(mContext, std::vector(4, mInt32Ty), false); // vector4 int type -+ -+ // fetch function signature -+ // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); -+ std::vector fsArgs; -+ fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0)); -+ fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0)); -+ -+ mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false); -+ -+ mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth); -+ mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth); -+ -+ mSimdVectorTy = StructType::get(mContext, std::vector(4, mSimtFP32Ty), false); -+ mSimdVectorInt32Ty = StructType::get(mContext, std::vector(4, mSimtInt32Ty), false); -+ -+#if defined(_WIN32) -+ // explicitly instantiate used symbols from potentially staticly linked libs -+ sys::DynamicLibrary::AddSymbol("exp2f", &exp2f); -+ sys::DynamicLibrary::AddSymbol("log2f", &log2f); -+ sys::DynamicLibrary::AddSymbol("sinf", &sinf); -+ sys::DynamicLibrary::AddSymbol("cosf", &cosf); -+ sys::DynamicLibrary::AddSymbol("powf", &powf); -+#endif -+ -+#if defined(_WIN32) -+ if (KNOB_DUMP_SHADER_IR) -+ { -+ CreateDirectory(INTEL_OUTPUT_DIR, NULL); -+ CreateDirectory(RASTY_OUTPUT_DIR, NULL); -+ CreateDirectory(JITTER_OUTPUT_DIR, NULL); -+ } -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create new LLVM module. -+void JitManager::SetupNewModule() -+{ -+ SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!"); -+ -+ std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << mJitNumber++; -+ std::unique_ptr newModule(new Module(fnName.str(), mContext)); -+ mpCurrentModule = newModule.get(); -+#if defined(_WIN32) -+ // Needed for MCJIT on windows -+ Triple hostTriple(sys::getProcessTriple()); -+ hostTriple.setObjectFormat(Triple::ELF); -+ newModule->setTargetTriple(hostTriple.getTriple()); -+#endif // _WIN32 -+ -+ mpExec->addModule(std::move(newModule)); -+ mIsModuleFinalized = false; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create new LLVM module from IR. -+bool JitManager::SetupModuleFromIR(const uint8_t *pIR) -+{ -+ std::unique_ptr pMem = MemoryBuffer::getMemBuffer(StringRef((const char*)pIR), ""); -+ -+ SMDiagnostic Err; -+ std::unique_ptr newModule = parseIR(pMem.get()->getMemBufferRef(), Err, mContext); -+ -+ if (newModule == nullptr) -+ { -+ SWR_ASSERT(0, "Parse failed! 
Check Err for details."); -+ return false; -+ } -+ -+ mpCurrentModule = newModule.get(); -+#if defined(_WIN32) -+ // Needed for MCJIT on windows -+ Triple hostTriple(sys::getProcessTriple()); -+ hostTriple.setObjectFormat(Triple::ELF); -+ newModule->setTargetTriple(hostTriple.getTriple()); -+#endif // _WIN32 -+ -+ mpExec->addModule(std::move(newModule)); -+ mIsModuleFinalized = false; -+ -+ return true; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Dump function to file. -+void JitManager::DumpToFile(Function *f, const char *fileName) -+{ -+ if (KNOB_DUMP_SHADER_IR) -+ { -+#if defined(_WIN32) -+ DWORD pid = GetCurrentProcessId(); -+ TCHAR procname[MAX_PATH]; -+ GetModuleFileName(NULL, procname, MAX_PATH); -+ const char* pBaseName = strrchr(procname, '\\'); -+ std::stringstream outDir; -+ outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; -+ CreateDirectory(outDir.str().c_str(), NULL); -+#endif -+ -+ std::error_code EC; -+ const char *funcName = f->getName().data(); -+ char fName[256]; -+#if defined(_WIN32) -+ sprintf(fName, "%s\\%s.%s.ll", outDir.str().c_str(), funcName, fileName); -+#else -+ sprintf(fName, "%s.%s.ll", funcName, fileName); -+#endif -+ raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None); -+ Module* pModule = f->getParent(); -+ pModule->print(fd, nullptr); -+ fd.flush(); -+ } -+} -+ -+extern "C" -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Create JIT context. -+ /// @param simdWidth - SIMD width to be used in generated program. -+ HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch) -+ { -+ return new JitManager(targetSimdWidth, arch); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Destroy JIT context. -+ void JITCALL JitDestroyContext(HANDLE hJitContext) -+ { -+ delete reinterpret_cast(hJitContext); -+ } -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h -new file mode 100644 -index 0000000..e0e8ec4 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h -@@ -0,0 +1,182 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
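A usage sketch of the extern "C" entry points just defined. Passing 0 for the SIMD width defers to the constructor's per-ISA default (8 for AVX/AVX2, 16 once the AVX512 path is implemented), and an empty arch string means "use the host ISA unless RASTY_KNOB_ARCH_STR overrides it". This assumes the declarations are exposed through jit_api.h, as the includes above suggest:

    #include "jit_api.h"   // assumed to declare HANDLE, JITCALL, JitCreateContext, JitDestroyContext

    int main()
    {
        // Width 0 => JitManager picks the default for the detected/requested ISA.
        // ""      => no explicit ISA cap; RASTY_KNOB_ARCH_STR may still lower it.
        HANDLE hJit = JitCreateContext(0, "");

        // ... build modules and JIT shaders against hJit ...

        // Deletes the JitManager; function pointers obtained through this
        // context should not be assumed valid after it is destroyed.
        JitDestroyContext(hJit);
        return 0;
    }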
-+* -+* @file JitManager.h -+* -+* @brief JitManager contains the LLVM data structures used for JIT generation -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/os.h" -+#include "common/isa.hpp" -+ -+#if defined(_WIN32) -+#pragma warning(disable : 4146 4244 4267 4800 4996) -+#endif -+ -+#include "llvm/IR/DataLayout.h" -+#include "llvm/IR/Instructions.h" -+#include "llvm/IR/LLVMContext.h" -+#include "llvm/IR/Module.h" -+#include "llvm/IR/Type.h" -+#include "llvm/IR/IRBuilder.h" -+#include "llvm/IR/IntrinsicInst.h" -+ -+#include "llvm/Config/llvm-config.h" -+#ifndef LLVM_VERSION_MAJOR -+#include "llvm/Config/config.h" -+#endif -+ -+#include "llvm/IR/Verifier.h" -+#include "llvm/ExecutionEngine/MCJIT.h" -+#include "llvm/Support/FileSystem.h" -+#define LLVM_F_NONE sys::fs::F_None -+ -+#include "llvm/Analysis/Passes.h" -+#include "llvm/PassManager.h" -+#include "llvm/CodeGen/Passes.h" -+#include "llvm/ExecutionEngine/ExecutionEngine.h" -+#include "llvm/Support/raw_ostream.h" -+#include "llvm/Support/TargetSelect.h" -+#include "llvm/Transforms/IPO.h" -+#include "llvm/Transforms/Scalar.h" -+#include "llvm/Support/Host.h" -+ -+ -+using namespace llvm; -+////////////////////////////////////////////////////////////////////////// -+/// JitInstructionSet -+/// @brief Subclass of InstructionSet that allows users to override -+/// the reporting of support for certain ISA features. This allows capping -+/// the jitted code to a certain feature level, e.g. jit AVX level code on -+/// a platform that supports AVX2. -+////////////////////////////////////////////////////////////////////////// -+class JitInstructionSet : public InstructionSet -+{ -+public: -+ JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa) -+ { -+ if (isaRequest == "") -+ { -+ // Check for an environment variable -+ const char* pIsaEnv = getenv("RASTY_KNOB_ARCH_STR"); -+ if (pIsaEnv) -+ { -+ isaRequest = pIsaEnv; -+ } -+ } -+ std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); -+ -+ if(isaRequest == "avx") -+ { -+ bForceAVX = true; -+ bForceAVX2 = false; -+ bForceAVX512 = false; -+ } -+ else if(isaRequest == "avx2") -+ { -+ bForceAVX = false; -+ bForceAVX2 = true; -+ bForceAVX512 = false; -+ } -+ #if 0 -+ else if(isaRequest == "avx512") -+ { -+ bForceAVX = false; -+ bForceAVX2 = false; -+ bForceAVX512 = true; -+ } -+ #endif -+ }; -+ -+ bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } -+ bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); } -+ bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); } -+ -+private: -+ bool bForceAVX = false; -+ bool bForceAVX2 = false; -+ bool bForceAVX512 = false; -+ std::string isaRequest; -+}; -+ -+ -+ -+struct JitLLVMContext : LLVMContext -+{ -+}; -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// JitManager -+////////////////////////////////////////////////////////////////////////// -+struct JitManager -+{ -+ JitManager(uint32_t w, const char *arch); -+ ~JitManager(){}; -+ -+ JitLLVMContext mContext; ///< LLVM compiler -+ IRBuilder<> mBuilder; ///< LLVM IR Builder -+ ExecutionEngine* mpExec; -+ -+ // Need to be rebuilt after a JIT and before building new IR -+ Module* mpCurrentModule; -+ bool mIsModuleFinalized; -+ uint32_t mJitNumber; -+ -+ uint32_t mVWidth; -+ -+ // Built in types. 
-+ Type* mInt8Ty; -+ Type* mInt32Ty; -+ Type* mInt64Ty; -+ Type* mFP32Ty; -+ StructType* mV4FP32Ty; -+ StructType* mV4Int32Ty; -+ -+ // helper scalar function types -+ FunctionType* mUnaryFPTy; -+ FunctionType* mBinaryFPTy; -+ FunctionType* mTrinaryFPTy; -+ FunctionType* mUnaryIntTy; -+ FunctionType* mBinaryIntTy; -+ FunctionType* mTrinaryIntTy; -+ -+ Type* mSimtFP32Ty; -+ Type* mSimtInt32Ty; -+ -+ Type* mSimdVectorInt32Ty; -+ Type* mSimdVectorTy; -+ -+ // fetch shader types -+ FunctionType* mFetchShaderTy; -+ -+ JitInstructionSet mArch; -+ -+ void SetupNewModule(); -+ bool SetupModuleFromIR(const uint8_t *pIR); -+ -+ static void DumpToFile(Function *f, const char *fileName); -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp -new file mode 100644 -index 0000000..5e8e5f4 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp -@@ -0,0 +1,473 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
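The point of JitInstructionSet is to let an explicit arch string, or the RASTY_KNOB_ARCH_STR environment variable, cap the ISA the jitter targets even when the hardware supports more. A minimal sketch of the same idea, independent of the InstructionSet base class; the cpuid plumbing is stubbed out with a hypothetical HwSupportsAvx2():

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    // Stand-in for the real cpuid-based detection in common/isa.hpp.
    static bool HwSupportsAvx2() { return true; }   // assumption for the sketch

    class IsaCap
    {
    public:
        explicit IsaCap(std::string request)
        {
            if (request.empty())
            {
                // Same override hook as JitInstructionSet.
                if (const char* env = std::getenv("RASTY_KNOB_ARCH_STR"))
                    request = env;
            }
            std::transform(request.begin(), request.end(), request.begin(), ::tolower);
            mForceAvx = (request == "avx");   // cap at AVX: stop reporting AVX2
        }

        // Mirrors JitInstructionSet::AVX2(): a forced lower cap wins over hardware.
        bool AVX2() const { return mForceAvx ? false : HwSupportsAvx2(); }

    private:
        bool mForceAvx = false;
    };

    int main() { printf("%d %d\n", IsaCap("").AVX2(), IsaCap("avx").AVX2()); }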
-+* -+* @file blend_jit.cpp -+* -+* @brief Implementation of the blend jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#include "jit_api.h" -+#include "blend_jit.h" -+#include "builder.h" -+#include "state_llvm.h" -+#include "common/containers.hpp" -+#include "llvm/IR/DataLayout.h" -+ -+#include -+ -+// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized -+#define QUANTIZE_THRESHOLD 2 -+ -+////////////////////////////////////////////////////////////////////////// -+/// Interface to Jitting a blend shader -+////////////////////////////////////////////////////////////////////////// -+struct BlendJit : public Builder -+{ -+ BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; -+ -+ template -+ void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4]) -+ { -+ Value* out[4]; -+ -+ switch (factor) -+ { -+ case BLENDFACTOR_ONE: -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); -+ break; -+ case BLENDFACTOR_SRC_COLOR: -+ out[0] = src[0]; -+ out[1] = src[1]; -+ out[2] = src[2]; -+ out[3] = src[3]; -+ break; -+ case BLENDFACTOR_SRC_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = src[3]; -+ break; -+ case BLENDFACTOR_DST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = dst[3]; -+ break; -+ case BLENDFACTOR_DST_COLOR: -+ out[0] = dst[0]; -+ out[1] = dst[1]; -+ out[2] = dst[2]; -+ out[3] = dst[3]; -+ break; -+ case BLENDFACTOR_SRC_ALPHA_SATURATE: -+ out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); -+ out[3] = VIMMED1(1.0f); -+ break; -+ case BLENDFACTOR_CONST_COLOR: -+ out[0] = constColor[0]; -+ out[1] = constColor[1]; -+ out[2] = constColor[2]; -+ out[3] = constColor[3]; -+ break; -+ case BLENDFACTOR_CONST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = constColor[3]; -+ break; -+ case BLENDFACTOR_SRC1_COLOR: -+ out[0] = src1[0]; -+ out[1] = src1[1]; -+ out[2] = src1[2]; -+ out[3] = src1[3]; -+ break; -+ case BLENDFACTOR_SRC1_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = src1[3]; -+ break; -+ case BLENDFACTOR_ZERO: -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); -+ break; -+ case BLENDFACTOR_INV_SRC_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), src[0]); -+ out[1] = FSUB(VIMMED1(1.0f), src[1]); -+ out[2] = FSUB(VIMMED1(1.0f), src[2]); -+ out[3] = FSUB(VIMMED1(1.0f), src[3]); -+ break; -+ case BLENDFACTOR_INV_SRC_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); -+ break; -+ case BLENDFACTOR_INV_DST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); -+ break; -+ case BLENDFACTOR_INV_DST_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), dst[0]); -+ out[1] = FSUB(VIMMED1(1.0f), dst[1]); -+ out[2] = FSUB(VIMMED1(1.0f), dst[2]); -+ out[3] = FSUB(VIMMED1(1.0f), dst[3]); -+ break; -+ case BLENDFACTOR_INV_CONST_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), constColor[0]); -+ out[1] = FSUB(VIMMED1(1.0f), constColor[1]); -+ out[2] = FSUB(VIMMED1(1.0f), constColor[2]); -+ out[3] = FSUB(VIMMED1(1.0f), constColor[3]); -+ break; -+ case BLENDFACTOR_INV_CONST_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); -+ break; -+ case BLENDFACTOR_INV_SRC1_COLOR: -+ out[0] = FSUB(VIMMED1(1.0f), src1[0]); -+ out[1] = FSUB(VIMMED1(1.0f), src1[1]); -+ out[2] = FSUB(VIMMED1(1.0f), src1[2]); -+ out[3] = FSUB(VIMMED1(1.0f), src1[3]); -+ break; -+ case BLENDFACTOR_INV_SRC1_ALPHA: -+ out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); -+ break; -+ default: -+ SWR_ASSERT(false, 
"Unsupported blend factor: %d", factor); -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); -+ break; -+ } -+ -+ if (Color) -+ { -+ result[0] = out[0]; -+ result[1] = out[1]; -+ result[2] = out[2]; -+ } -+ -+ if (Alpha) -+ { -+ result[3] = out[3]; -+ } -+ } -+ -+ void Clamp(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ SWR_TYPE type = info.type[0]; -+ -+ switch (type) -+ { -+ case SWR_TYPE_FLOAT: -+ break; -+ -+ case SWR_TYPE_UNORM: -+ src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f)); -+ break; -+ -+ case SWR_TYPE_SNORM: -+ src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); -+ break; -+ -+ default: SWR_ASSERT(false, "Unsupport format type: %d", type); -+ } -+ } -+ -+ void ApplyDefaults(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ -+ bool valid[] = { false, false, false, false }; -+ for (uint32_t c = 0; c < info.numComps; ++c) -+ { -+ valid[info.swizzle[c]] = true; -+ } -+ -+ for (uint32_t c = 0; c < 4; ++c) -+ { -+ if (!valid[c]) -+ { -+ src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty); -+ } -+ } -+ } -+ -+ void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ -+ for (uint32_t c = 0; c < info.numComps; ++c) -+ { -+ if (info.type[c] == SWR_TYPE_UNUSED) -+ { -+ src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); -+ } -+ } -+ } -+ -+ void Quantize(SWR_FORMAT format, Value* src[4]) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(format); -+ for (uint32_t c = 0; c < info.numComps; ++c) -+ { -+ if (info.bpc[c] <= QUANTIZE_THRESHOLD) -+ { -+ uint32_t swizComp = info.swizzle[c]; -+ float factor = (float)((1 << info.bpc[c]) - 1); -+ switch (info.type[c]) -+ { -+ case SWR_TYPE_UNORM: -+ src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); -+ src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); -+ src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor)); -+ break; -+ default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]); -+ } -+ } -+ } -+ } -+ -+ template -+ void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4]) -+ { -+ Value* out[4]; -+ Value* srcBlend[4]; -+ Value* dstBlend[4]; -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ srcBlend[i] = FMUL(src[i], srcFactor[i]); -+ dstBlend[i] = FMUL(dst[i], dstFactor[i]); -+ } -+ -+ switch (blendOp) -+ { -+ case BLENDOP_ADD: -+ out[0] = FADD(srcBlend[0], dstBlend[0]); -+ out[1] = FADD(srcBlend[1], dstBlend[1]); -+ out[2] = FADD(srcBlend[2], dstBlend[2]); -+ out[3] = FADD(srcBlend[3], dstBlend[3]); -+ break; -+ -+ case BLENDOP_SUBTRACT: -+ out[0] = FSUB(srcBlend[0], dstBlend[0]); -+ out[1] = FSUB(srcBlend[1], dstBlend[1]); -+ out[2] = FSUB(srcBlend[2], dstBlend[2]); -+ out[3] = FSUB(srcBlend[3], dstBlend[3]); -+ break; -+ -+ case BLENDOP_REVSUBTRACT: -+ out[0] = FSUB(dstBlend[0], srcBlend[0]); -+ out[1] = FSUB(dstBlend[1], srcBlend[1]); -+ out[2] = FSUB(dstBlend[2], srcBlend[2]); -+ out[3] = 
FSUB(dstBlend[3], srcBlend[3]); -+ break; -+ -+ case BLENDOP_MIN: -+ out[0] = VMINPS(src[0], dst[0]); -+ out[1] = VMINPS(src[1], dst[1]); -+ out[2] = VMINPS(src[2], dst[2]); -+ out[3] = VMINPS(src[3], dst[3]); -+ break; -+ -+ case BLENDOP_MAX: -+ out[0] = VMAXPS(src[0], dst[0]); -+ out[1] = VMAXPS(src[1], dst[1]); -+ out[2] = VMAXPS(src[2], dst[2]); -+ out[3] = VMAXPS(src[3], dst[3]); -+ break; -+ -+ default: -+ SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp); -+ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); -+ break; -+ } -+ -+ if (Color) -+ { -+ result[0] = out[0]; -+ result[1] = out[1]; -+ result[2] = out[2]; -+ } -+ -+ if (Alpha) -+ { -+ result[3] = out[3]; -+ } -+ } -+ -+ Function* Create(const BLEND_COMPILE_STATE& state) -+ { -+ static std::size_t jitNum = 0; -+ -+ std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << jitNum++; -+ -+ // blend function signature -+ // typedef void(*PFN_BLEND_JIT_FUNC)(SWR_BLEND_STATE*, simdvector&, simdvector&, uint8_t*, simdvector&); -+ -+ std::vector args{ -+ PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE* -+ PointerType::get(mSimdFP32Ty, 0), // simdvector& src -+ PointerType::get(mSimdFP32Ty, 0), // simdvector& src1 -+ PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst -+ PointerType::get(mSimdFP32Ty, 0), // simdvector& result -+ }; -+ -+ FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); -+ Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); -+ -+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); -+ -+ IRB()->SetInsertPoint(entry); -+ -+ // arguments -+ auto argitr = blendFunc->getArgumentList().begin(); -+ Value* pBlendState = argitr++; -+ pBlendState->setName("pBlendState"); -+ Value* pSrc = argitr++; -+ pSrc->setName("src"); -+ Value* pSrc1 = argitr++; -+ pSrc1->setName("src1"); -+ Value* pDst = argitr++; -+ pDst->setName("pDst"); -+ Value* pResult = argitr++; -+ pResult->setName("result"); -+ -+ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); -+ Value* dst[4]; -+ Value* constantColor[4]; -+ Value* src[4]; -+ Value* src1[4]; -+ Value* result[4]; -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ // load hot tile -+ dst[i] = LOAD(pDst, { i }); -+ -+ // load constant color -+ constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i })); -+ -+ // load src -+ src[i] = LOAD(pSrc, { i }); -+ -+ // load src1 -+ src1[i] = LOAD(pSrc1, { i }); -+ } -+ -+ // clamp sources -+ Clamp(state.format, src); -+ Clamp(state.format, src1); -+ Clamp(state.format, dst); -+ Clamp(state.format, constantColor); -+ -+ // apply defaults to hottile contents to take into account missing components -+ ApplyDefaults(state.format, dst); -+ -+ // Force defaults for unused 'X' components -+ ApplyUnusedDefaults(state.format, dst); -+ -+ // Quantize low precision components -+ Quantize(state.format, dst); -+ -+ // special case clamping for R11G11B10_float which has no sign bit -+ if (state.format == R11G11B10_FLOAT) -+ { -+ dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); -+ dst[1] = VMAXPS(dst[1], VIMMED1(0.0f)); -+ dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); -+ dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); -+ } -+ -+ Value* srcFactor[4]; -+ Value* dstFactor[4]; -+ if (state.independentAlphaBlendEnable) -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); 
-+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor); -+ -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ BlendFunc((SWR_BLEND_OP)state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+ else -+ { -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); -+ GenerateBlendFactor((SWR_BLEND_FACTOR)state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); -+ -+ BlendFunc((SWR_BLEND_OP)state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); -+ } -+ -+ // store results out -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ STORE(result[i], pResult, { i }); -+ } -+ -+ RET_VOID(); -+ -+ JitManager::DumpToFile(blendFunc, ""); -+ -+ FunctionPassManager passes(JM()->mpCurrentModule); -+ passes.add(createBreakCriticalEdgesPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createPromoteMemoryToRegisterPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createInstructionCombiningPass()); -+ passes.add(createInstructionSimplifierPass()); -+ passes.add(createConstantPropagationPass()); -+ passes.add(createSCCPPass()); -+ passes.add(createAggressiveDCEPass()); -+ -+ passes.run(*blendFunc); -+ -+ JitManager::DumpToFile(blendFunc, "optimized"); -+ -+ return blendFunc; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JITs from fetch shader IR -+/// @param hJitMgr - JitManager handle -+/// @param func - LLVM function IR -+/// @return PFN_FETCH_FUNC - pointer to fetch code -+PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) -+{ -+ const llvm::Function *func = (const llvm::Function*)hFunc; -+ JitManager* pJitMgr = reinterpret_cast(hJitMgr); -+ PFN_BLEND_JIT_FUNC pfnBlend; -+ pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); -+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module -+ pJitMgr->mIsModuleFinalized = true; -+ -+ return pfnBlend; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles blend shader -+/// @param hJitMgr - JitManager handle -+/// @param state - blend state to build function from -+extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state) -+{ -+ JitManager* pJitMgr = reinterpret_cast(hJitMgr); -+ -+ pJitMgr->SetupNewModule(); -+ -+ BlendJit theJit(pJitMgr); -+ HANDLE hFunc = theJit.Create(state); -+ -+ return JitBlendFunc(hJitMgr, hFunc); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h -new file mode 100644 -index 0000000..80c4c03 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h -@@ -0,0 +1,49 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
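For reference, the arithmetic the jitted blend path encodes, restated for a single pixel with plain floats: factor selection as in GenerateBlendFactor() and the BLENDOP_ADD case of BlendFunc(). This is an illustrative sketch covering only a few factor cases, with no clamping, quantization, or dual-source factors; it is not driver code:

    #include <cstdio>

    struct Color { float r, g, b, a; };

    // Subset of the factors handled in GenerateBlendFactor().
    enum class Factor { One, Zero, SrcAlpha, InvSrcAlpha };

    static Color FactorValue(Factor f, const Color& src)
    {
        switch (f)
        {
        case Factor::One:         return {1.f, 1.f, 1.f, 1.f};
        case Factor::Zero:        return {0.f, 0.f, 0.f, 0.f};
        case Factor::SrcAlpha:    return {src.a, src.a, src.a, src.a};
        case Factor::InvSrcAlpha: return {1.f - src.a, 1.f - src.a, 1.f - src.a, 1.f - src.a};
        }
        return {0.f, 0.f, 0.f, 0.f};
    }

    // BLENDOP_ADD from BlendFunc(): result = src*srcFactor + dst*dstFactor, per channel.
    static Color BlendAdd(const Color& src, const Color& dst, Factor sf, Factor df)
    {
        Color s = FactorValue(sf, src);
        Color d = FactorValue(df, src);
        return { src.r * s.r + dst.r * d.r,
                 src.g * s.g + dst.g * d.g,
                 src.b * s.b + dst.b * d.b,
                 src.a * s.a + dst.a * d.a };
    }

    int main()
    {
        Color src = {1.0f, 0.0f, 0.0f, 0.5f};   // half-transparent red fragment
        Color dst = {0.0f, 0.0f, 1.0f, 1.0f};   // opaque blue already in the hot tile
        Color out = BlendAdd(src, dst, Factor::SrcAlpha, Factor::InvSrcAlpha);
        printf("%.2f %.2f %.2f %.2f\n", out.r, out.g, out.b, out.a);   // 0.50 0.00 0.50 0.75
    }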
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file blend_jit.h -+* -+* @brief Definition of the blend jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "core/context.h" -+#include "core/state.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// State required for blend jit -+////////////////////////////////////////////////////////////////////////// -+struct BLEND_COMPILE_STATE -+{ -+ SWR_FORMAT format; // format of render target being blended -+ bool independentAlphaBlendEnable; -+ SWR_RENDER_TARGET_BLEND_STATE blendState; -+ -+ bool operator==(const BLEND_COMPILE_STATE& other) const -+ { -+ return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp -new file mode 100644 -index 0000000..b971791 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp -@@ -0,0 +1,56 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
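Because BLEND_COMPILE_STATE compares itself with memcmp, it is effectively a bit-pattern key: padding and unset fields must be zeroed, or two logically identical states can compare unequal. A small sketch of using it that way as a lookup key; the linear cache is hypothetical, and it assumes jit_api.h declares HANDLE, PFN_BLEND_JIT_FUNC, and JitCompileBlend as in the sources above:

    #include <utility>
    #include <vector>
    #include "jit_api.h"       // assumed: HANDLE, PFN_BLEND_JIT_FUNC, JitCompileBlend
    #include "blend_jit.h"     // BLEND_COMPILE_STATE

    // Hypothetical linear cache keyed on the memcmp-comparable compile state.
    static PFN_BLEND_JIT_FUNC GetBlendFunc(HANDLE hJit, const BLEND_COMPILE_STATE& state)
    {
        static std::vector<std::pair<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>> cache;

        for (auto& entry : cache)
            if (entry.first == state)      // operator== is memcmp over the whole struct
                return entry.second;

        PFN_BLEND_JIT_FUNC pfn = JitCompileBlend(hJit, state);
        cache.emplace_back(state, pfn);
        return pfn;
    }

    // Callers should value-initialize the key so padding bytes are zero:
    //     BLEND_COMPILE_STATE state = {};
    //     state.format = R32G32B32A32_FLOAT;
    //     ...
    //     PFN_BLEND_JIT_FUNC pfn = GetBlendFunc(hJit, state);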
-+* -+* @file builder.h -+* -+* @brief Includes all the builder related functionality -+* -+* Notes: -+* -+******************************************************************************/ -+ -+#include "builder.h" -+ -+using namespace llvm; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Contructor for Builder. -+/// @param pJitMgr - JitManager which contains modules, function passes, etc. -+Builder::Builder(JitManager *pJitMgr) -+ : mpJitMgr(pJitMgr) -+{ -+ mpIRBuilder = &pJitMgr->mBuilder; -+ -+ mFP16Ty = Type::getHalfTy(pJitMgr->mContext); -+ mFP32Ty = Type::getFloatTy(pJitMgr->mContext); -+ mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); -+ mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); -+ mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); -+ mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); -+ mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector(4, mFP32Ty), false); // vector4 float type (represented as structure) -+ mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector(4, mInt32Ty), false); // vector4 int type -+ mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth); -+ mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth); -+ mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth); -+ mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth); -+ mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h -new file mode 100644 -index 0000000..1342f28 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h -@@ -0,0 +1,66 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file builder.h -+* -+* @brief Includes all the builder related functionality -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "JitManager.h" -+#include "common/formats.h" -+ -+using namespace llvm; -+ -+struct Builder -+{ -+ Builder(JitManager *pJitMgr); -+ IRBuilder<>* IRB() { return mpIRBuilder; }; -+ JitManager* JM() { return mpJitMgr; } -+ -+ JitManager* mpJitMgr; -+ IRBuilder<>* mpIRBuilder; -+ -+ // Built in types. 
-+ Type* mInt8Ty; -+ Type* mInt16Ty; -+ Type* mInt32Ty; -+ Type* mInt64Ty; -+ Type* mFP16Ty; -+ Type* mFP32Ty; -+ Type* mSimdFP16Ty; -+ Type* mSimdFP32Ty; -+ Type* mSimdInt16Ty; -+ Type* mSimdInt32Ty; -+ Type* mSimdInt64Ty; -+ StructType* mV4FP32Ty; -+ StructType* mV4Int32Ty; -+ -+#include "builder_gen.h" -+#include "builder_x86.h" -+#include "builder_misc.h" -+#include "builder_math.h" -+ -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp -new file mode 100644 -index 0000000..7b5ef20 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp -@@ -0,0 +1,1052 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
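For orientation: with the AVX/AVX2 width chosen in JitManager (mVWidth == 8), mSimdFP32Ty is an 8-wide float vector, and the simdvector struct types are four such channels, i.e. eight 4-component values stored channel-major. A plain-C++ picture of that layout, assuming 8 lanes; the type names here are illustrative, not the rasterizer's own:

    #include <cstdio>

    constexpr int kSimdWidth = 8;            // mVWidth for AVX/AVX2 in JitManager

    // One SIMD register's worth of a single channel (<8 x float> in the IR).
    struct SimdFloat { float lane[kSimdWidth]; };

    // Mirrors mSimdVectorTy: a struct of four 8-wide channels (x, y, z, w).
    struct SimdVector { SimdFloat v[4]; };

    int main()
    {
        SimdVector pos = {};
        // Element 3's (x, w) live at v[0].lane[3] and v[3].lane[3]:
        pos.v[0].lane[3] = 1.0f;   // x of element 3
        pos.v[3].lane[3] = 1.0f;   // w of element 3
        printf("%f %f\n", pos.v[0].lane[3], pos.v[3].lane[3]);
    }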
-+* -+* @file builder_gen.cpp -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#include "builder.h" -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GLOBAL_STRING(StringRef Str, const Twine &Name) -+{ -+ return IRB()->CreateGlobalString(Str, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMCPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMCPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) -+{ -+ return IRB()->CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::LIFETIME_START(Value *Ptr, ConstantInt *Size) -+{ -+ return IRB()->CreateLifetimeStart(Ptr, Size); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::LIFETIME_END(Value *Ptr, ConstantInt *Size) -+{ -+ return IRB()->CreateLifetimeEnd(Ptr, Size); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru, const Twine &Name) -+{ -+ return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask) -+{ -+ return IRB()->CreateMaskedStore(Val, Ptr, Align, Mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::ASSUMPTION(Value *Cond) -+{ -+ 
return IRB()->CreateAssumption(Cond); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::GC_STATEPOINT(Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name) -+{ -+ return IRB()->CreateGCStatepoint(ActualCallee, CallArgs, DeoptArgs, GCArgs, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name) -+{ -+ return IRB()->CreateGCResult(Statepoint, ResultType, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name) -+{ -+ return IRB()->CreateGCRelocate(Statepoint, BaseOffset, DerivedOffset, ResultType, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+ReturnInst *Builder::RET_VOID() -+{ -+ return IRB()->CreateRetVoid(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+ReturnInst *Builder::RET(Value *V) -+{ -+ return IRB()->CreateRet(V); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+ReturnInst *Builder::AGGREGATE_RET(Value *const *retVals, unsigned N) -+{ -+ return IRB()->CreateAggregateRet(retVals, N); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+BranchInst *Builder::BR(BasicBlock *Dest) -+{ -+ return IRB()->CreateBr(Dest); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+BranchInst *Builder::COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights) -+{ -+ return IRB()->CreateCondBr(Cond, True, False, BranchWeights); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+SwitchInst *Builder::SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases, MDNode *BranchWeights) -+{ -+ return IRB()->CreateSwitch(V, Dest, NumCases, BranchWeights); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+IndirectBrInst *Builder::INDIRECT_BR(Value *Addr, unsigned NumDests) -+{ -+ return IRB()->CreateIndirectBr(Addr, NumDests); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, const Twine &Name) -+{ -+ return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, const Twine &Name) -+{ -+ return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Arg1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE3(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name) -+{ -+ return IRB()->CreateInvoke3(Callee, NormalDest, UnwindDest, Arg1, Arg2, Arg3, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args, const Twine &Name) -+{ -+ return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, Name); -+} -+ 
-+////////////////////////////////////////////////////////////////////////// -+ResumeInst *Builder::RESUME(Value *Exn) -+{ -+ return IRB()->CreateResume(Exn); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+UnreachableInst *Builder::UNREACHABLE() -+{ -+ return IRB()->CreateUnreachable(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ADD(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateAdd(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_ADD(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNSWAdd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_ADD(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNUWAdd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FADD(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFAdd(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SUB(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateSub(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_SUB(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNSWSub(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_SUB(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNUWSub(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FSUB(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFSub(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::MUL(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateMul(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_MUL(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNSWMul(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_MUL(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateNUWMul(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FMUL(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFMul(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::UDIV(Value *LHS, Value *RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateUDiv(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateExactUDiv(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SDIV(Value *LHS, Value *RHS, const Twine &Name, bool isExact) 
-+{ -+ return IRB()->CreateSDiv(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateExactSDiv(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FDIV(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFDiv(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::UREM(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateURem(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SREM(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateSRem(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FREM(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFRem(LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SHL(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SHL(Value *LHS, const APInt &RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SHL(Value *LHS, uint64_t RHS, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::LSHR(Value *LHS, Value *RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateLShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::LSHR(Value *LHS, const APInt &RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateLShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::LSHR(Value *LHS, uint64_t RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateLShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ASHR(Value *LHS, Value *RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateAShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ASHR(Value *LHS, const APInt &RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateAShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ASHR(Value *LHS, uint64_t RHS, const Twine &Name, bool isExact) -+{ -+ return IRB()->CreateAShr(LHS, RHS, Name, isExact); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::AND(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateAnd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::AND(Value *LHS, const APInt &RHS, const Twine &Name) -+{ 
-+ return IRB()->CreateAnd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::AND(Value *LHS, uint64_t RHS, const Twine &Name) -+{ -+ return IRB()->CreateAnd(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::OR(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateOr(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::OR(Value *LHS, const APInt &RHS, const Twine &Name) -+{ -+ return IRB()->CreateOr(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::OR(Value *LHS, uint64_t RHS, const Twine &Name) -+{ -+ return IRB()->CreateOr(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::XOR(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateXor(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::XOR(Value *LHS, const APInt &RHS, const Twine &Name) -+{ -+ return IRB()->CreateXor(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::XOR(Value *LHS, uint64_t RHS, const Twine &Name) -+{ -+ return IRB()->CreateXor(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateBinOp(Opc, LHS, RHS, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NEG(Value *V, const Twine &Name, bool HasNUW, bool HasNSW) -+{ -+ return IRB()->CreateNeg(V, Name, HasNUW, HasNSW); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NSW_NEG(Value *V, const Twine &Name) -+{ -+ return IRB()->CreateNSWNeg(V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NUW_NEG(Value *V, const Twine &Name) -+{ -+ return IRB()->CreateNUWNeg(V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FNEG(Value *V, const Twine &Name, MDNode *FPMathTag) -+{ -+ return IRB()->CreateFNeg(V, Name, FPMathTag); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::NOT(Value *V, const Twine &Name) -+{ -+ return IRB()->CreateNot(V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+AllocaInst *Builder::ALLOCA(Type *Ty, Value *ArraySize, const Twine &Name) -+{ -+ return IRB()->CreateAlloca(Ty, ArraySize, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::LOAD(Value *Ptr, const char *Name) -+{ -+ return IRB()->CreateLoad(Ptr, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::LOAD(Value *Ptr, const Twine &Name) -+{ -+ return IRB()->CreateLoad(Ptr, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name) -+{ -+ return IRB()->CreateLoad(Ptr, isVolatile, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// 
-+StoreInst *Builder::STORE(Value *Val, Value *Ptr, bool isVolatile) -+{ -+ return IRB()->CreateStore(Val, Ptr, isVolatile); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name) -+{ -+ return IRB()->CreateAlignedLoad(Ptr, Align, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name) -+{ -+ return IRB()->CreateAlignedLoad(Ptr, Align, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name) -+{ -+ return IRB()->CreateAlignedLoad(Ptr, Align, isVolatile, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+StoreInst *Builder::ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile) -+{ -+ return IRB()->CreateAlignedStore(Val, Ptr, Align, isVolatile); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+FenceInst *Builder::FENCE(AtomicOrdering Ordering, SynchronizationScope SynchScope, const Twine &Name) -+{ -+ return IRB()->CreateFence(Ordering, SynchScope, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+AtomicCmpXchgInst *Builder::ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) -+{ -+ return IRB()->CreateAtomicCmpXchg(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SynchScope); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+AtomicRMWInst *Builder::ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SynchronizationScope SynchScope) -+{ -+ return IRB()->CreateAtomicRMW(Op, Ptr, Val, Ordering, SynchScope); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GEPA(Value *Ptr, ArrayRef IdxList, const Twine &Name) -+{ -+ return IRB()->CreateGEP(Ptr, IdxList, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IN_BOUNDS_GEP(Value *Ptr, ArrayRef IdxList, const Twine &Name) -+{ -+ return IRB()->CreateInBoundsGEP(Ptr, IdxList, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GEP(Value *Ptr, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateGEP(Ptr, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IN_BOUNDS_GEP(Value *Ptr, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateInBoundsGEP(Ptr, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP1_32(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP1_32(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP2_32(Ptr, Idx0, Idx1, Name); -+} -+ 
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP2_32(Ptr, Idx0, Idx1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP1_64(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP1_64(Ptr, Idx0, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstGEP2_64(Ptr, Idx0, Idx1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name) -+{ -+ return IRB()->CreateConstInBoundsGEP2_64(Ptr, Idx0, Idx1, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::STRUCT_GEP(Value *Ptr, unsigned Idx, const Twine &Name) -+{ -+ return IRB()->CreateStructGEP(Ptr, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::GLOBAL_STRING_PTR(StringRef Str, const Twine &Name) -+{ -+ return IRB()->CreateGlobalStringPtr(Str, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::Z_EXT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateZExt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::S_EXT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSExt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateZExtOrTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSExtOrTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_TO_UI(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPToUI(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_TO_SI(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPToSI(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::UI_TO_FP(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateUIToFP(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SI_TO_FP(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSIToFP(V, DestTy, Name); -+} -+ 
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_TRUNC(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPTrunc(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_EXT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPExt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreatePtrToInt(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateIntToPtr(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BITCAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateAddrSpaceCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateZExtOrBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateSExtOrBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateTruncOrBitCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateCast(Op, V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::POINTER_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreatePointerCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreatePointerBitCastOrAddrSpaceCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name) -+{ -+ return IRB()->CreateIntCast(V, DestTy, isSigned, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateBitOrPointerCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FP_CAST(Value *V, Type *DestTy, const Twine &Name) -+{ -+ return IRB()->CreateFPCast(V, DestTy, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpEQ(LHS, RHS, Name); -+} -+ 
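The wrappers above cover the full IRBuilder cast family. As an illustration only (EmitSExt16ToFloat is hypothetical and not part of this patch), the widen-then-convert sequence used when unpacking narrow integer formats would be spelled like this with these wrappers:

// Hypothetical helper, for illustration only: sign extend a 16-bit integer
// value to 32 bits, then convert it to float.
Value *Builder::EmitSExt16ToFloat(Value *val16)
{
    Value *val32 = S_EXT(val16, mInt32Ty);   // i16 -> i32, preserving sign
    return SI_TO_FP(val32, mFP32Ty);         // i32 -> float
}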
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_NE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpNE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpUGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpUGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpULT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpULE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSLT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmpSLE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOEQ(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOLT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpOLE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpONE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpORD(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUNO(LHS, RHS, Name); -+} -+ 
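The ordered float predicates above (FCMP_O*) evaluate to false whenever either operand is NaN, while FCMP_ORD and FCMP_UNO test orderedness itself; the unordered variants follow below. A hypothetical sketch (EmitCanonicalizeNaN is not part of this patch, and it relies on SELECT, which appears further down in this generated file) of the usual self-compare idiom built from these wrappers:

// Hypothetical helper, for illustration only: replace NaN lanes of 'v' with
// a supplied default. FCMP_UNO(v, v) is true exactly where v is NaN.
Value *Builder::EmitCanonicalizeNaN(Value *v, Value *vDefault)
{
    Value *isNaN = FCMP_UNO(v, v);
    return SELECT(isNaN, vDefault, v);
}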
-+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUEQ(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUGT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUGE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpULT(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpULE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmpUNE(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateICmp(P, LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreateFCmp(P, LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+PHINode *Builder::PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name) -+{ -+ return IRB()->CreatePHI(Ty, NumReservedValues, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL(Value *Callee, const Twine &Name) -+{ -+ return IRB()->CreateCall(Callee, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL(Value *Callee, Value *Arg, const Twine &Name) -+{ -+ return IRB()->CreateCall(Callee, Arg, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL2(Value *Callee, Value *Arg1, Value *Arg2, const Twine &Name) -+{ -+ return IRB()->CreateCall2(Callee, Arg1, Arg2, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL3(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name) -+{ -+ return IRB()->CreateCall3(Callee, Arg1, Arg2, Arg3, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL4(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, const Twine &Name) -+{ -+ return IRB()->CreateCall4(Callee, Arg1, Arg2, Arg3, Arg4, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALL5(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, Value *Arg5, const Twine &Name) -+{ -+ return IRB()->CreateCall5(Callee, Arg1, Arg2, Arg3, Arg4, Arg5, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::CALLA(Value *Callee, ArrayRef Args, const Twine &Name) -+{ -+ return IRB()->CreateCall(Callee, Args, Name); 
-+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::SELECT(Value *C, Value *True, Value *False, const Twine &Name) -+{ -+ return IRB()->CreateSelect(C, True, False, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+VAArgInst *Builder::VA_ARG(Value *List, Type *Ty, const Twine &Name) -+{ -+ return IRB()->CreateVAArg(List, Ty, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VEXTRACT(Value *Vec, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateExtractElement(Vec, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name) -+{ -+ return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name) -+{ -+ return IRB()->CreateShuffleVector(V1, V2, Mask, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXTRACT_VALUE(Value *Agg, ArrayRef Idxs, const Twine &Name) -+{ -+ return IRB()->CreateExtractValue(Agg, Idxs, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::INSERT_VALUE(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name) -+{ -+ return IRB()->CreateInsertValue(Agg, Val, Idxs, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+LandingPadInst *Builder::LANDING_PAD(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name) -+{ -+ return IRB()->CreateLandingPad(Ty, PersFn, NumClauses, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IS_NULL(Value *Arg, const Twine &Name) -+{ -+ return IRB()->CreateIsNull(Arg, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::IS_NOT_NULL(Value *Arg, const Twine &Name) -+{ -+ return IRB()->CreateIsNotNull(Arg, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name) -+{ -+ return IRB()->CreatePtrDiff(LHS, RHS, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name) -+{ -+ return IRB()->CreateVectorSplat(NumElts, V, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name) -+{ -+ return IRB()->CreateExtractInteger(DL, From, ExtractedTy, Offset, Name); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+CallInst *Builder::ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue) -+{ -+ return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h -new file mode 100644 -index 0000000..c39077c ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h -@@ -0,0 +1,205 @@ 
-+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file builder_gen.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// Auto-generated Builder IR declarations -+////////////////////////////////////////////////////////////////////////// -+Value *GLOBAL_STRING(StringRef Str, const Twine &Name = ""); -+CallInst *MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMCPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMCPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); -+CallInst *LIFETIME_START(Value *Ptr, ConstantInt *Size = nullptr); -+CallInst *LIFETIME_END(Value *Ptr, ConstantInt *Size = nullptr); -+CallInst *MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = 0, const Twine &Name = ""); -+CallInst *MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask); -+CallInst *ASSUMPTION(Value *Cond); -+CallInst *GC_STATEPOINT(Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = ""); -+CallInst *GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name = ""); -+CallInst *GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int 
DerivedOffset, Type *ResultType, const Twine &Name = ""); -+ReturnInst *RET_VOID(); -+ReturnInst *RET(Value *V); -+ReturnInst *AGGREGATE_RET(Value *const *retVals, unsigned N); -+BranchInst *BR(BasicBlock *Dest); -+BranchInst *COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights = nullptr); -+SwitchInst *SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases = 10, MDNode *BranchWeights = nullptr); -+IndirectBrInst *INDIRECT_BR(Value *Addr, unsigned NumDests = 10); -+InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, const Twine &Name = ""); -+InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, const Twine &Name = ""); -+InvokeInst *INVOKE3(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name = ""); -+InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args, const Twine &Name = ""); -+ResumeInst *RESUME(Value *Exn); -+UnreachableInst *UNREACHABLE(); -+Value *ADD(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_ADD(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *NUW_ADD(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FADD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *SUB(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_SUB(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *NUW_SUB(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FSUB(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *MUL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_MUL(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *NUW_MUL(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FMUL(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *UDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *SDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FDIV(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *UREM(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *SREM(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FREM(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *SHL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *SHL(Value *LHS, const APInt &RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *SHL(Value *LHS, uint64_t RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *LSHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *LSHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false); -+Value *LSHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false); -+Value *ASHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false); -+Value *ASHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false); -+Value *ASHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false); -+Value *AND(Value *LHS, Value *RHS, 
const Twine &Name = ""); -+Value *AND(Value *LHS, const APInt &RHS, const Twine &Name = ""); -+Value *AND(Value *LHS, uint64_t RHS, const Twine &Name = ""); -+Value *OR(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *OR(Value *LHS, const APInt &RHS, const Twine &Name = ""); -+Value *OR(Value *LHS, uint64_t RHS, const Twine &Name = ""); -+Value *XOR(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *XOR(Value *LHS, const APInt &RHS, const Twine &Name = ""); -+Value *XOR(Value *LHS, uint64_t RHS, const Twine &Name = ""); -+Value *BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *NEG(Value *V, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false); -+Value *NSW_NEG(Value *V, const Twine &Name = ""); -+Value *NUW_NEG(Value *V, const Twine &Name = ""); -+Value *FNEG(Value *V, const Twine &Name = "", MDNode *FPMathTag = nullptr); -+Value *NOT(Value *V, const Twine &Name = ""); -+AllocaInst *ALLOCA(Type *Ty, Value *ArraySize = nullptr, const Twine &Name = ""); -+LoadInst *LOAD(Value *Ptr, const char *Name); -+LoadInst *LOAD(Value *Ptr, const Twine &Name = ""); -+LoadInst *LOAD(Value *Ptr, bool isVolatile, const Twine &Name = ""); -+StoreInst *STORE(Value *Val, Value *Ptr, bool isVolatile = false); -+LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name); -+LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name = ""); -+LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name = ""); -+StoreInst *ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false); -+FenceInst *FENCE(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread, const Twine &Name = ""); -+AtomicCmpXchgInst *ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope = CrossThread); -+AtomicRMWInst *ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread); -+Value *GEPA(Value *Ptr, ArrayRef IdxList, const Twine &Name = ""); -+Value *IN_BOUNDS_GEP(Value *Ptr, ArrayRef IdxList, const Twine &Name = ""); -+Value *GEP(Value *Ptr, Value *Idx, const Twine &Name = ""); -+Value *IN_BOUNDS_GEP(Value *Ptr, Value *Idx, const Twine &Name = ""); -+Value *CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = ""); -+Value *CONST_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = ""); -+Value *CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = ""); -+Value *CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = ""); -+Value *CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = ""); -+Value *STRUCT_GEP(Value *Ptr, unsigned Idx, const Twine &Name = ""); -+Value *GLOBAL_STRING_PTR(StringRef Str, const Twine &Name = ""); -+Value *TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *Z_EXT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *S_EXT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); 
-+Value *FP_TO_UI(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_TO_SI(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *UI_TO_FP(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *SI_TO_FP(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_TRUNC(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_EXT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *BITCAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name = ""); -+Value *POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name = ""); -+Value *BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *FP_CAST(Value *V, Type *DestTy, const Twine &Name = ""); -+Value *ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_NE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = ""); -+PHINode *PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name = ""); -+CallInst *CALL(Value *Callee, const Twine &Name = ""); -+CallInst *CALL(Value *Callee, Value *Arg, const Twine &Name = ""); -+CallInst *CALL2(Value *Callee, Value *Arg1, Value *Arg2, const Twine &Name = ""); -+CallInst *CALL3(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name = ""); 
-+CallInst *CALL4(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, const Twine &Name = ""); -+CallInst *CALL5(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, Value *Arg5, const Twine &Name = ""); -+CallInst *CALLA(Value *Callee, ArrayRef Args, const Twine &Name = ""); -+Value *SELECT(Value *C, Value *True, Value *False, const Twine &Name = ""); -+VAArgInst *VA_ARG(Value *List, Type *Ty, const Twine &Name = ""); -+Value *VEXTRACT(Value *Vec, Value *Idx, const Twine &Name = ""); -+Value *VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name = ""); -+Value *VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name = ""); -+Value *EXTRACT_VALUE(Value *Agg, ArrayRef Idxs, const Twine &Name = ""); -+Value *INSERT_VALUE(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name = ""); -+LandingPadInst *LANDING_PAD(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name = ""); -+Value *IS_NULL(Value *Arg, const Twine &Name = ""); -+Value *IS_NOT_NULL(Value *Arg, const Twine &Name = ""); -+Value *PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name = ""); -+Value *VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name = ""); -+Value *EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name); -+CallInst *ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue = nullptr); -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h -new file mode 100644 -index 0000000..92867ec ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h -@@ -0,0 +1,34 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file builder_math.h -+* -+* @brief math/alu builder functions -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+Value* VLOG2PS(Value* src); -+Value* VPOW24PS(Value* src); -+Value* VEXP2PS(Value* src); -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp -new file mode 100644 -index 0000000..5897121 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp -@@ -0,0 +1,1195 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file builder_misc.cpp -+* -+* @brief Implementation for miscellaneous builder functions -+* -+* Notes: -+* -+******************************************************************************/ -+#include "builder.h" -+#include "llvm/Support/DynamicLibrary.h" -+ -+void __cdecl CallPrint(const char* fmt, ...); -+ -+Constant *Builder::C(bool i) -+{ -+ return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); -+} -+ -+Constant *Builder::C(char i) -+{ -+ return ConstantInt::get(IRB()->getInt8Ty(), i); -+} -+ -+Constant *Builder::C(uint8_t i) -+{ -+ return ConstantInt::get(IRB()->getInt8Ty(), i); -+} -+ -+Constant *Builder::C(int i) -+{ -+ return ConstantInt::get(IRB()->getInt32Ty(), i); -+} -+ -+Constant *Builder::C(int64_t i) -+{ -+ return ConstantInt::get(IRB()->getInt64Ty(), i); -+} -+ -+Constant *Builder::C(UINT16 i) -+{ -+ return ConstantInt::get(mInt16Ty,i); -+} -+ -+Constant *Builder::C(uint32_t i) -+{ -+ return ConstantInt::get(IRB()->getInt32Ty(), i); -+} -+ -+Constant *Builder::C(float i) -+{ -+ return ConstantFP::get(IRB()->getFloatTy(), i); -+} -+ -+Constant *Builder::PRED(bool pred) -+{ -+ return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 
1 : 0)); -+} -+ -+Value *Builder::VIMMED1(int i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VIMMED1(uint32_t i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VIMMED1(float i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VIMMED1(bool i) -+{ -+ return ConstantVector::getSplat(JM()->mVWidth, cast(C(i))); -+} -+ -+Value *Builder::VUNDEF_IPTR() -+{ -+ return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); -+} -+ -+Value *Builder::VUNDEF_I() -+{ -+ return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); -+} -+ -+Value *Builder::VUNDEF(Type *ty, uint32_t size) -+{ -+ return UndefValue::get(VectorType::get(ty, size)); -+} -+ -+Value *Builder::VUNDEF_F() -+{ -+ return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); -+} -+ -+Value *Builder::VUNDEF(Type* t) -+{ -+ return UndefValue::get(VectorType::get(t, JM()->mVWidth)); -+} -+ -+Value *Builder::VINSERT(Value *vec, Value *val, int index) -+{ -+ return VINSERT(vec, val, C(index)); -+} -+ -+Value *Builder::VBROADCAST(Value *src) -+{ -+ // check if src is already a vector -+ if (src->getType()->isVectorTy()) -+ { -+ return src; -+ } -+ -+ Value *vecRet = VUNDEF(src->getType()); -+ vecRet = VINSERT(vecRet, src, 0); -+ vecRet = VSHUFFLE(vecRet, vecRet, VIMMED1(0)); -+ -+ return vecRet; -+} -+ -+uint32_t Builder::IMMED(Value* v) -+{ -+ SWR_ASSERT(isa(v)); -+ ConstantInt *pValConst = cast(v); -+ return pValConst->getZExtValue(); -+} -+ -+Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) -+{ -+ std::vector indices; -+ for (auto i : indexList) -+ indices.push_back(i); -+ return GEPA(ptr, indices); -+} -+ -+Value *Builder::GEP(Value* ptr, const std::initializer_list &indexList) -+{ -+ std::vector indices; -+ for (auto i : indexList) -+ indices.push_back(C(i)); -+ return GEPA(ptr, indices); -+} -+ -+LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(C(i)); -+ return LOAD(GEPA(basePtr, valIndices), name); -+} -+ -+LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list &indices, const llvm::Twine& name) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(i); -+ return LOAD(GEPA(basePtr, valIndices), name); -+} -+ -+StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list &indices) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(C(i)); -+ return STORE(val, GEPA(basePtr, valIndices)); -+} -+ -+StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list &indices) -+{ -+ std::vector valIndices; -+ for (auto i : indices) -+ valIndices.push_back(i); -+ return STORE(val, GEPA(basePtr, valIndices)); -+} -+ -+CallInst *Builder::CALL(Value *Callee, const std::initializer_list &argsList) -+{ -+ std::vector args; -+ for (auto arg : argsList) -+ args.push_back(arg); -+ return CALLA(Callee, args); -+} -+ -+Value *Builder::VRCP(Value *va) -+{ -+ return FDIV(VIMMED1(1.0f), va); // 1 / a -+} -+ -+Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY) -+{ -+ Value* vOut = FMADDPS(vA, vX, vC); -+ vOut = FMADDPS(vB, vY, vOut); -+ return vOut; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate an i32 masked load operation in LLVM IR. 
If not -+/// supported on the underlying platform, emulate it with float masked load -+/// @param src - base address pointer for the load -+/// @param vMask - SIMD wide mask that controls whether to access memory or load 0 -+Value *Builder::MASKLOADD(Value* src,Value* mask) -+{ -+ Value* vResult; -+ // use avx2 maskload instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); -+ vResult = CALL2(func,src,mask); -+ } -+ else -+ { -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); -+ Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth)); -+ vResult = BITCAST(CALL2(func,src,fMask), VectorType::get(mInt32Ty,JM()->mVWidth)); -+ } -+ return vResult; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief insert a JIT call to CallPrint -+/// - outputs formatted string to both stdout and VS output window -+/// - DEBUG builds only -+/// Usage example: -+/// PRINT("index %d = 0x%p\n",{C(lane), pIndex}); -+/// where C(lane) creates a constant value to print, and pIndex is the Value* -+/// result from a GEP, printing out the pointer to memory -+/// @param printStr - constant string to print, which includes format specifiers -+/// @param printArgs - initializer list of Value*'s to print to stdout -+CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs) -+{ -+#if defined( DEBUG ) || defined( _DEBUG ) -+ // push the arguments to CallPrint into a vector -+ std::vector<Value*> printCallArgs; -+ // save room for the format string. we still need to modify it for vectors -+ printCallArgs.resize(1); -+ -+ // search through the format string for special processing -+ size_t pos = 0; -+ std::string tempStr(printStr); -+ pos = tempStr.find('%', pos); -+ auto v = printArgs.begin(); -+ // printing is slow. now it's slower...
-+ while((pos != std::string::npos) && (v != printArgs.end())) -+ { -+ // for %f we need to cast float Values to doubles so that they print out correctly -+ if((tempStr[pos+1]=='f') && ((*v)->getType()->isFloatTy())) -+ { -+ printCallArgs.push_back(FP_EXT(*v, Type::getDoubleTy(JM()->mContext))); -+ pos++; -+ } -+ // add special handling for %f and %d format specifiers to make printing llvm vector types easier -+ else if((*v)->getType()->isVectorTy()) -+ { -+ if((tempStr[pos+1]=='f') && ((*v)->getType()->getContainedType(0)->isFloatTy())) -+ { -+ uint32_t i = 0; -+ for( ; i < ((*v)->getType()->getVectorNumElements())-1; i++) -+ { -+ tempStr.insert(pos, std::string("%f ")); -+ pos+=3; -+ printCallArgs.push_back(FP_EXT(VEXTRACT(*v, C(i)), Type::getDoubleTy(JM()->mContext))); -+ } -+ printCallArgs.push_back(FP_EXT(VEXTRACT(*v,C(i)),Type::getDoubleTy(JM()->mContext))); -+ } -+ else if((tempStr[pos+1]=='d') && ((*v)->getType()->getContainedType(0)->isIntegerTy())) -+ { -+ uint32_t i = 0; -+ for( ; i < ((*v)->getType()->getVectorNumElements())-1; i++) -+ { -+ tempStr.insert(pos,std::string("%d ")); -+ pos += 3; -+ printCallArgs.push_back(VEXTRACT(*v,C(i))); -+ } -+ printCallArgs.push_back(VEXTRACT(*v,C(i))); -+ } -+ else -+ { -+ /// not a supported vector to print -+ /// @todo pointer types too -+ SWR_ASSERT(0); -+ } -+ } -+ else -+ { -+ printCallArgs.push_back(*v); -+ } -+ -+ // advance to the next argument -+ v++; -+ pos = tempStr.find('%', ++pos); -+ } -+ -+ // create global variable constant string -+ Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); -+ GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); -+ JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); -+ -+ // get a pointer to the first character in the constant string array -+ std::vector<Constant*> geplist{C(0),C(0)}; -+ Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); -+ -+ // insert the pointer to the format string in the argument vector -+ printCallArgs[0] = strGEP; -+ -+ // get pointer to CallPrint function and insert decl into the module if needed -+ std::vector<Type*> args; -+ args.push_back(PointerType::get(mInt8Ty,0)); -+ FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); -+ Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); -+ -+ // if we haven't yet added the symbol to the symbol table -+ if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) -+ { -+ sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); -+ } -+ -+ // insert a call to CallPrint -+ return CALLA(callPrintFn,printCallArgs); -+#else // #if defined( DEBUG ) || defined( _DEBUG ) -+ return nullptr; -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a masked gather operation in LLVM IR.
If not -+/// supported on the underlying platform, emulate it with loads -+/// @param vSrc - SIMD wide value that will be loaded if mask is invalid -+/// @param pBase - Int8* base VB address pointer value -+/// @param vIndices - SIMD wide value of VB byte offsets -+/// @param vMask - SIMD wide mask that controls whether to access memory or the src values -+/// @param scale - value to scale indices by -+Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) -+{ -+ Value* vGather; -+ -+ // use avx2 gather instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ // force mask to , required by vgather -+ vMask = BITCAST(vMask, mSimdFP32Ty); -+ vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); -+ } -+ else -+ { -+ Value* pStack = STACKSAVE(); -+ -+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address -+ Value* vSrcPtr = ALLOCA(vSrc->getType()); -+ STORE(vSrc, vSrcPtr); -+ -+ vGather = VUNDEF_F(); -+ Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); -+ Value *vOffsets = MUL(vIndices,vScaleVec); -+ Value *mask = MASK(vMask); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ // single component byte index -+ Value *offset = VEXTRACT(vOffsets,C(i)); -+ // byte pointer to component -+ Value *loadAddress = GEP(pBase,offset); -+ loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); -+ // pointer to the value to load if we're masking off a component -+ Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); -+ Value *selMask = VEXTRACT(mask,C(i)); -+ // switch in a safe address to load if we're trying to access a vertex -+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); -+ Value *val = LOAD(validAddress); -+ vGather = VINSERT(vGather,val,C(i)); -+ } -+ STACKRESTORE(pStack); -+ } -+ -+ return vGather; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a masked gather operation in LLVM IR. If not -+/// supported on the underlying platform, emulate it with loads -+/// @param vSrc - SIMD wide value that will be loaded if mask is invalid -+/// @param pBase - Int8* base VB address pointer value -+/// @param vIndices - SIMD wide value of VB byte offsets -+/// @param vMask - SIMD wide mask that controls whether to access memory or the src values -+/// @param scale - value to scale indices by -+Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) -+{ -+ Value* vGather; -+ -+ // use avx2 gather instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); -+ } -+ else -+ { -+ Value* pStack = STACKSAVE(); -+ -+ // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address -+ Value* vSrcPtr = ALLOCA(vSrc->getType()); -+ STORE(vSrc, vSrcPtr); -+ -+ vGather = VUNDEF_I(); -+ Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); -+ Value *vOffsets = MUL(vIndices, vScaleVec); -+ Value *mask = MASK(vMask); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ // single component byte index -+ Value *offset = VEXTRACT(vOffsets, C(i)); -+ // byte pointer to component -+ Value *loadAddress = GEP(pBase, offset); -+ loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); -+ // pointer to the value to load if we're masking off a component -+ Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); -+ Value *selMask = VEXTRACT(mask, C(i)); -+ // switch in a safe address to load if we're trying to access a vertex -+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); -+ Value *val = LOAD(validAddress, C(0)); -+ vGather = VINSERT(vGather, val, C(i)); -+ } -+ -+ STACKRESTORE(pStack); -+ } -+ return vGather; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief convert x86 mask to llvm mask -+Value* Builder::MASK(Value* vmask) -+{ -+ Value* src = BITCAST(vmask, mSimdInt32Ty); -+ return ICMP_SLT(src, VIMMED1(0)); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief convert llvm mask to x86 mask -+Value* Builder::VMASK(Value* mask) -+{ -+ return S_EXT(mask, mSimdInt32Ty); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPSHUFB operation in LLVM IR. If not -+/// supported on the underlying platform, emulate it -+/// @param a - 256bit SIMD(32x8bit) of 8bit integer values -+/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values -+/// Byte masks in lower 128 lane of b selects 8 bit values from lower -+/// 128bits of a, and vice versa for the upper lanes. If the mask -+/// value is negative, '0' is inserted. -+Value *Builder::PSHUFB(Value* a, Value* b) -+{ -+ Value* res; -+ // use avx2 pshufb instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ res = VPSHUFB(a, b); -+ } -+ else -+ { -+ Constant* cB = dyn_cast(b); -+ // number of 8 bit elements in b -+ uint32_t numElms = cast(cB->getType())->getNumElements(); -+ // output vector -+ Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); -+ -+ // insert an 8 bit value from the high and low lanes of a per loop iteration -+ numElms /= 2; -+ for(uint32_t i = 0; i < numElms; i++) -+ { -+ ConstantInt* cLow128b = cast(cB->getAggregateElement(i)); -+ ConstantInt* cHigh128b = cast(cB->getAggregateElement(i + numElms)); -+ -+ // extract values from constant mask -+ char valLow128bLane = (char)(cLow128b->getSExtValue()); -+ char valHigh128bLane = (char)(cHigh128b->getSExtValue()); -+ -+ Value* insertValLow128b; -+ Value* insertValHigh128b; -+ -+ // if the mask value is negative, insert a '0' in the respective output position -+ // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector -+ insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); -+ insertValHigh128b = (valHigh128bLane < 0) ? 
C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); -+ -+ vShuf = VINSERT(vShuf, insertValLow128b, i); -+ vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); -+ } -+ res = vShuf; -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32 -+/// bits) in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only -+/// lower 8 values are used. -+Value *Builder::PMOVSXBD(Value* a) -+{ -+ Value* res; -+ // use avx2 byte sign extend instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ res = VPMOVSXBD(a); -+ } -+ else -+ { -+ // VPMOVSXBD output type -+ Type* v8x32Ty = VectorType::get(mInt32Ty, 8); -+ // Extract 8 values from 128bit lane and sign extend -+ res = S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32 -+/// bits) in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. -+Value *Builder::PMOVSXWD(Value* a) -+{ -+ Value* res; -+ // use avx2 word sign extend if available -+ if(JM()->mArch.AVX2()) -+ { -+ res = VPMOVSXWD(a); -+ } -+ else -+ { -+ // VPMOVSXWD output type -+ Type* v8x32Ty = VectorType::get(mInt32Ty, 8); -+ // Extract 8 values from 128bit lane and sign extend -+ res = S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VPERMD operation (shuffle 32 bit integer values -+/// across 128 bit lanes) in LLVM IR. If not supported on the underlying -+/// platform, emulate it -+/// @param a - 256bit SIMD lane(8x32bit) of integer values. -+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values -+Value *Builder::PERMD(Value* a, Value* idx) -+{ -+ Value* res; -+ // use avx2 permute instruction if available -+ if(JM()->mArch.AVX2()) -+ { -+ // llvm 3.6.0 swapped the order of the args to vpermd -+ res = VPERMD(idx, a); -+ } -+ else -+ { -+ res = VSHUFFLE(a, a, idx); -+ } -+ return res; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) -+/// in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-+Value *Builder::CVTPH2PS(Value* a) -+{ -+ if (JM()->mArch.F16C()) -+ { -+ return VCVTPH2PS(a); -+ } -+ else -+ { -+ Value* vExt = S_EXT(a, mSimdInt32Ty); -+ Value* sign = AND(vExt,0x80000000); -+ -+ // normal case -+ Value* mantissa = SHL(AND(vExt, 0x03ff), 13); -+ Value* exponent = AND(vExt, 0x7c00); -+ exponent = ADD(exponent, VIMMED1(0x1c000)); -+ exponent = SHL(exponent, 13); -+ -+ Value* result = OR(OR(sign, mantissa), exponent); -+ -+ // handle 0 -+ Value* zeroMask = ICMP_EQ(AND(vExt, 0x7fff), VIMMED1(0)); -+ result = SELECT(zeroMask, sign, result); -+ -+ // handle infinity -+ Value* infMask = ICMP_EQ(AND(vExt, 0x7c00), VIMMED1(0x7c00)); -+ Value* signedInf = OR(VIMMED1(0x7f800000), sign); -+ result = SELECT(infMask, signedInf, result); -+ -+ // @todo handle subnormal -+ -+ // cast to f32 -+ result = BITCAST(result, mSimdFP32Ty); -+ return result; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) -+/// in LLVM IR. If not supported on the underlying platform, emulate it -+/// @param a - 256bit SIMD lane(8x32bit) of float32 values. -+Value *Builder::CVTPS2PH(Value* a, Value* rounding) -+{ -+ if (JM()->mArch.F16C()) -+ { -+ return VCVTPS2PH(a, rounding); -+ } -+ else -+ { -+ SWR_ASSERT(false, "Emulation of VCVTPS2PH unimplemented."); -+ return nullptr; -+ } -+} -+ -+Value *Builder::PMAXSD(Value* a, Value* b) -+{ -+ if (JM()->mArch.AVX2()) -+ { -+ return VPMAXSD(a, b); -+ } -+ else -+ { -+ // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources -+ Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); -+ -+ // low 128 -+ Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); -+ Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); -+ Value* resLo = CALL2(pmaxsd, aLo, bLo); -+ -+ // high 128 -+ Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); -+ Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); -+ Value* resHi = CALL2(pmaxsd, aHi, bHi); -+ -+ // combine -+ Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); -+ result = VINSERTI128(result, resHi, C((uint8_t)1)); -+ -+ return result; -+ } -+} -+ -+Value *Builder::PMINSD(Value* a, Value* b) -+{ -+ if (JM()->mArch.AVX2()) -+ { -+ return VPMINSD(a, b); -+ } -+ else -+ { -+ // use 4-wide sse min intrinsic on lower/upper halves of 8-wide sources -+ Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); -+ -+ // low 128 -+ Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); -+ Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); -+ Value* resLo = CALL2(pminsd, aLo, bLo); -+ -+ // high 128 -+ Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); -+ Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); -+ Value* resHi = CALL2(pminsd, aHi, bHi); -+ -+ // combine -+ Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); -+ result = VINSERTI128(result, resHi, C((uint8_t)1)); -+ -+ return result; -+ } -+} -+ -+void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, -+ Value* mask, Value* vGatherComponents[], bool bPackedOutput) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(format); -+ if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) -+ { -+ // ensure our mask is the correct type -+ mask = BITCAST(mask, mSimdFP32Ty); -+ GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); -+ } -+ else -+ { -+ // ensure our mask is the correct type -+ mask = BITCAST(mask, mSimdInt32Ty); -+ GATHER4DD(info, pSrcBase,
byteOffsets, mask, vGatherComponents, bPackedOutput); -+ } -+} -+ -+void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, -+ Value* mask, Value* vGatherComponents[], bool bPackedOutput) -+{ -+ switch(info.bpp / info.numComps) -+ { -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // TODO: vGatherMaskedVal -+ Value* vGatherMaskedVal = VIMMED1((float)0); -+ -+ // always have at least one component out of x or y to fetch -+ -+ // save mask as it is zero'd out after each gather -+ vMask = mask; -+ -+ vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ -+ // if we have at least one component out of x or y to fetch -+ if(info.numComps > 2) -+ { -+ // offset base to the next components(zw) in the vertex to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ vMask = mask; -+ -+ vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ else -+ { -+ vGatherResult[1] = vGatherMaskedVal; -+ } -+ -+ // Shuffle gathered components into place, each row is a component -+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); -+ } -+ break; -+ case 32: -+ { -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = mask; -+ -+ // Gather a SIMD of components -+ vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); -+ -+ // offset base to the next component to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ } -+ } -+ break; -+ default: -+ SWR_ASSERT(0, "Invalid float format"); -+ break; -+ } -+} -+ -+void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, -+ Value* mask, Value* vGatherComponents[], bool bPackedOutput) -+{ -+ switch (info.bpp / info.numComps) -+ { -+ case 8: -+ { -+ Value* vGatherMaskedVal = VIMMED1((int32_t)0); -+ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); -+ // e.g. result of an 8x32bit integer gather for 8bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw -+ -+ Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); -+ } -+ break; -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // TODO: vGatherMaskedVal -+ Value* vGatherMaskedVal = VIMMED1((int32_t)0); -+ -+ // always have at least one component out of x or y to fetch -+ -+ // save mask as it is zero'd out after each gather -+ vMask = mask; -+ -+ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. 
result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ -+ // if we have at least one component out of x or y to fetch -+ if(info.numComps > 2) -+ { -+ // offset base to the next components(zw) in the vertex to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ vMask = mask; -+ -+ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); -+ // e.g. result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ else -+ { -+ vGatherResult[1] = vGatherMaskedVal; -+ } -+ -+ // Shuffle gathered components into place, each row is a component -+ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); -+ -+ } -+ break; -+ case 32: -+ { -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherComponents[i] = VIMMED1((int)info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = mask; -+ -+ // Gather a SIMD of components -+ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); -+ -+ // offset base to the next component to gather -+ pSrcBase = GEP(pSrcBase, C((char)4)); -+ } -+ } -+ break; -+ default: -+ SWR_ASSERT(0, "unsupported format"); -+ break; -+ } -+} -+ -+void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) -+{ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits -+ -+ // input could either be float or int vector; do shuffle work in int -+ vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); -+ vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); -+ -+ if(bPackedOutput) -+ { -+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ -+ // shuffle mask -+ Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, -+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy -+ -+ Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ // after PERMD: move and pack xy components into each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(info.numComps > 2) -+ { -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); -+ vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ } -+ -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ // todo: fixed for packed -+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); -+ if(i >= info.numComps) -+ { -+ // set the default component val -+ vGatherOutput[swizzleIndex] = vGatherMaskedVal; -+ continue; -+ } -+ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t 
lane = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; -+ -+ // extract packed component 128 bit lanes -+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); -+ } -+ -+ } -+ else -+ { -+ // pshufb masks for each component -+ Value* vConstMask[2]; -+ // x/z shuffle mask -+ vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, -+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); -+ -+ // y/w shuffle mask -+ vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, -+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); -+ -+ -+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // select correct constMask for x/z or y/w pshufb -+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ uint32_t selectedGather = (i < 2) ? 0 : 1; -+ -+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); -+ // after pshufb mask for x channel; z uses the same shuffle from the second gather -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 -+ } -+ } -+} -+ -+void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) -+{ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits -+ -+ if(bPackedOutput) -+ { -+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ // shuffle mask -+ Value* vConstMask = C({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, -+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww -+ -+ Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); -+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(info.numComps > 2) -+ { -+ vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); -+ } -+ -+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ uint32_t swizzleIndex = info.swizzle[i]; -+ // todo: fix for packed -+ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); -+ if(i >= info.numComps) -+ { -+ // set the default component val -+ vGatherOutput[swizzleIndex] = vGatherMaskedVal; -+ continue; -+ } -+ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t lane = ((i == 0) || (i == 2)) ? 
0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; -+ -+ // sign extend -+ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); -+ } -+ } -+ // else zero extend -+ else{ -+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits -+ // apply defaults -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); -+ } -+ -+ for(uint32_t i = 0; i < info.numComps; i++){ -+ uint32_t swizzleIndex = info.swizzle[i]; -+ -+ // pshufb masks for each component -+ Value* vConstMask; -+ switch(i) -+ { -+ case 0: -+ // x shuffle mask -+ vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, -+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); -+ break; -+ case 1: -+ // y shuffle mask -+ vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, -+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); -+ break; -+ case 2: -+ // z shuffle mask -+ vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, -+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); -+ break; -+ case 3: -+ // w shuffle mask -+ vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, -+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); -+ break; -+ default: -+ vConstMask = nullptr; -+ break; -+ } -+ -+ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb for x channel -+ // 256i - 0 1 2 3 4 5 6 7 -+ // x000 x000 x000 x000 x000 x000 x000 x000 -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief emulates a scatter operation. 
-+/// @param pDst - pointer to destination -+/// @param vSrc - vector of src data to scatter -+/// @param vOffsets - vector of byte offsets from pDst -+/// @param vMask - mask of valid lanes -+void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) -+{ -+ Value* pStack = STACKSAVE(); -+ -+ // allocate tmp stack for masked off lanes -+ Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); -+ -+ Value *mask = MASK(vMask); -+ for (uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ Value *offset = VEXTRACT(vOffsets, C(i)); -+ // byte pointer to component -+ Value *storeAddress = GEP(pDst, offset); -+ storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); -+ Value *selMask = VEXTRACT(mask, C(i)); -+ Value *srcElem = VEXTRACT(vSrc, C(i)); -+ // switch in a safe address to load if we're trying to access a vertex -+ Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr); -+ STORE(srcElem, validAddress); -+ } -+ -+ STACKRESTORE(pStack); -+} -+ -+Value* Builder::VABSPS(Value* a) -+{ -+ Value* asInt = BITCAST(a, mSimdInt32Ty); -+ Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); -+ return result; -+} -+ -+Value *Builder::ICLAMP(Value* src, Value* low, Value* high) -+{ -+ Value *lowCmp = ICMP_SLT(src, low); -+ Value *ret = SELECT(lowCmp, low, src); -+ -+ Value *highCmp = ICMP_SGT(ret, high); -+ ret = SELECT(highCmp, high, ret); -+ -+ return ret; -+} -+ -+Value *Builder::FCLAMP(Value* src, Value* low, Value* high) -+{ -+ Value *lowCmp = FCMP_OLT(src, low); -+ Value *ret = SELECT(lowCmp, low, src); -+ -+ Value *highCmp = FCMP_OGT(ret, high); -+ ret = SELECT(highCmp, high, ret); -+ -+ return ret; -+} -+ -+Value *Builder::FCLAMP(Value* src, float low, float high) -+{ -+ Value* result = VMAXPS(src, VIMMED1(low)); -+ result = VMINPS(result, VIMMED1(high)); -+ -+ return result; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief save/restore stack, providing ability to push/pop the stack and -+/// reduce overall stack requirements for temporary stack use -+Value* Builder::STACKSAVE() -+{ -+ Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -+ return CALL(pfnStackSave); -+} -+ -+void Builder::STACKRESTORE(Value* pSaved) -+{ -+ Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); -+ CALL(pfnStackRestore, pSaved); -+} -+ -+Value *Builder::FMADDPS(Value* a, Value* b, Value* c) -+{ -+ Value* vOut; -+ // use FMADs if available -+ if(JM()->mArch.AVX2()) -+ { -+ vOut = VFMADDPS(a, b, c); -+ } -+ else -+ { -+ vOut = FADD(FMUL(a, b), c); -+ } -+ return vOut; -+} -+ -+Value* Builder::POPCNT(Value* a) -+{ -+ Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); -+ return CALL(pCtPop, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief C functions called by LLVM IR -+////////////////////////////////////////////////////////////////////////// -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief called in JIT code, inserted by PRINT -+/// output to both stdout and visual studio debug console -+void __cdecl CallPrint(const char* fmt, ...) 
-+{ -+#if defined( DEBUG ) || defined( _DEBUG ) -+ va_list args; -+ va_start(args, fmt); -+ vprintf(fmt, args); -+ -+#if defined( _WIN32 ) -+ char strBuf[1024]; -+ vsnprintf_s(strBuf, _TRUNCATE, fmt, args); -+ OutputDebugString(strBuf); -+#endif -+#endif // #if defined( DEBUG ) || defined( _DEBUG ) -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h -new file mode 100644 -index 0000000..8a32c6a ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h -@@ -0,0 +1,141 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+*
-+* @file builder_misc.h
-+*
-+* @brief miscellaneous builder functions
-+*
-+* Notes:
-+*
-+******************************************************************************/
-+#pragma once
-+
-+Constant *C(bool i);
-+Constant *C(char i);
-+Constant *C(uint8_t i);
-+Constant *C(int i);
-+Constant *C(int64_t i);
-+Constant *C(UINT16 i);
-+Constant *C(uint32_t i);
-+Constant *C(float i);
-+
-+template<typename Ty>
-+Constant *C(const std::initializer_list<Ty> &constList)
-+{
-+    std::vector<Constant*> vConsts;
-+    for(auto i : constList) {
-+
-+        vConsts.push_back(C((Ty)i));
-+    }
-+    return ConstantVector::get(vConsts);
-+}
-+
-+Constant *PRED(bool pred);
-+Value *VIMMED1(int i);
-+Value *VIMMED1(uint32_t i);
-+Value *VIMMED1(float i);
-+Value *VIMMED1(bool i);
-+Value *VUNDEF(Type* t);
-+Value *VUNDEF_F();
-+Value *VUNDEF_I();
-+Value *VUNDEF(Type* ty, uint32_t size);
-+Value *VUNDEF_IPTR();
-+Value *VINSERT(Value *vec, Value *val, int index);
-+Value *VBROADCAST(Value *src);
-+Value *VRCP(Value *va);
-+Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
-+
-+uint32_t IMMED(Value* i);
-+
-+Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-+Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
-+CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args);
-+
-+LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = "");
-+LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
-+StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
-+StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
-+
-+Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); }
-+Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); }
-+Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); }
-+Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_UNORD_Q)); }
-+Value *VCMPPS_NEQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_NEQ_OQ)); }
-+Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE_OQ)); }
-+Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); }
-+Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); }
-+
-+Value *MASK(Value* vmask);
-+Value *VMASK(Value* mask);
-+
-+//////////////////////////////////////////////////////////////////////////
-+/// @brief functions that build IR to call x86 intrinsics directly, or
-+/// emulate them with other instructions if not available on the host
-+//////////////////////////////////////////////////////////////////////////
-+Value *MASKLOADD(Value* src, Value* mask);
-+
-+void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
-+             Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-+
-+Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
-+void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-+               Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-+
-+Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale);
-+void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-+               Value* mask, Value* vGatherComponents[], bool bPackedOutput);
-+
-+void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
-+
-+void Shuffle8bpcGather4(const SWR_FORMAT_INFO
&info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); -+void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); -+ -+Value *PSHUFB(Value* a, Value* b); -+Value *PMOVSXBD(Value* a); -+Value *PMOVSXWD(Value* a); -+Value *PERMD(Value* a, Value* idx); -+Value *CVTPH2PS(Value* a); -+Value *CVTPS2PH(Value* a, Value* rounding); -+Value *PMAXSD(Value* a, Value* b); -+Value *PMINSD(Value* a, Value* b); -+Value *VABSPS(Value* a); -+Value *FMADDPS(Value* a, Value* b, Value* c); -+ -+// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior -+Value *VPCMPGTD(Value* a, Value* b) -+{ -+ Value* vIndexMask = ICMP_UGT(a,b); -+ -+ // need to set the high bit for x86 intrinsic masks -+ return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth)); -+} -+ -+Value *ICLAMP(Value* src, Value* low, Value* high); -+Value *FCLAMP(Value* src, Value* low, Value* high); -+Value *FCLAMP(Value* src, float low, float high); -+ -+CallInst *PRINT(const std::string &printStr,const std::initializer_list &printArgs); -+Value* STACKSAVE(); -+void STACKRESTORE(Value* pSaved); -+ -+Value* POPCNT(Value* a); -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp -new file mode 100644 -index 0000000..b4ae075 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp -@@ -0,0 +1,242 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file builder_x86.cpp -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#include "builder.h" -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256); -+ return IRB()->CreateCall5(func, src, pBase, indices, mask, scale); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256); -+ return IRB()->CreateCall5(func, src, pBase, indices, mask, scale); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VSQRTPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_sqrt_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VRSQRTPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_rsqrt_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VRCPPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_rcp_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMINPS(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_min_ps_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMAXPS(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_max_ps_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMINSD(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMAXSD(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VROUND(Value* a, Value* rounding) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256); -+ return IRB()->CreateCall2(func, a, rounding); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCMPPS(Value* a, Value* b, Value* cmpop) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_cmp_ps_256); -+ return IRB()->CreateCall3(func, a, b, cmpop); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VBLENDVPS(Value* a, Value* b, Value* mask) -+{ -+ 
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_blendv_ps_256); -+ return IRB()->CreateCall3(func, a, b, mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::BEXTR_32(Value* src, Value* control) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_bmi_bextr_32); -+ return IRB()->CreateCall2(func, src, control); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMASKLOADD(Value* src, Value* mask) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); -+ return IRB()->CreateCall2(func, src, mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMASKMOVPS(Value* src, Value* mask) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256); -+ return IRB()->CreateCall2(func, src, mask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPSHUFB(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pshuf_b); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMOVSXBD(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPMOVSXWD(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPERMD(Value* idx, Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_permd); -+ return IRB()->CreateCall2(func, idx, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCVTPH2PS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCVTPS2PH(Value* a, Value* round) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtps2ph_256); -+ return IRB()->CreateCall2(func, a, round); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VEXTRACTF128(Value* a, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_ps_256); -+ return IRB()->CreateCall2(func, a, imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VEXTRACTI128(Value* a, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); -+ return IRB()->CreateCall2(func, a, imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VINSERTF128(Value* a, Value* b, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_ps_256); -+ return IRB()->CreateCall3(func, a, b, 
imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VINSERTI128(Value* a, Value* b, Value* imm8) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); -+ return IRB()->CreateCall3(func, a, b, imm8); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VHSUBPS(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VPTESTC(Value* a, Value* b) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_ptestc_256); -+ return IRB()->CreateCall2(func, a, b); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VFMADDPS(Value* a, Value* b, Value* c) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_fma_vfmadd_ps_256); -+ return IRB()->CreateCall3(func, a, b, c); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VCVTTPS2DQ(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_cvtt_ps2dq_256); -+ return IRB()->CreateCall(func, a); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+Value *Builder::VMOVMSKPS(Value* a) -+{ -+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_movmsk_ps_256); -+ return IRB()->CreateCall(func, a); -+} -+ -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h -new file mode 100644 -index 0000000..bdaabca ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h -@@ -0,0 +1,65 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file builder_x86.h -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+////////////////////////////////////////////////////////////////////////// -+/// Auto-generated x86 intrinsics -+////////////////////////////////////////////////////////////////////////// -+Value *VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); -+Value *VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); -+Value *VSQRTPS(Value* a); -+Value *VRSQRTPS(Value* a); -+Value *VRCPPS(Value* a); -+Value *VMINPS(Value* a, Value* b); -+Value *VMAXPS(Value* a, Value* b); -+Value *VPMINSD(Value* a, Value* b); -+Value *VPMAXSD(Value* a, Value* b); -+Value *VROUND(Value* a, Value* rounding); -+Value *VCMPPS(Value* a, Value* b, Value* cmpop); -+Value *VBLENDVPS(Value* a, Value* b, Value* mask); -+Value *BEXTR_32(Value* src, Value* control); -+Value *VMASKLOADD(Value* src, Value* mask); -+Value *VMASKMOVPS(Value* src, Value* mask); -+Value *VPSHUFB(Value* a, Value* b); -+Value *VPMOVSXBD(Value* a); -+Value *VPMOVSXWD(Value* a); -+Value *VPERMD(Value* idx, Value* a); -+Value *VCVTPH2PS(Value* a); -+Value *VCVTPS2PH(Value* a, Value* round); -+Value *VEXTRACTF128(Value* a, Value* imm8); -+Value *VEXTRACTI128(Value* a, Value* imm8); -+Value *VINSERTF128(Value* a, Value* b, Value* imm8); -+Value *VINSERTI128(Value* a, Value* b, Value* imm8); -+Value *VHSUBPS(Value* a, Value* b); -+Value *VPTESTC(Value* a, Value* b); -+Value *VFMADDPS(Value* a, Value* b, Value* c); -+Value *VCVTTPS2DQ(Value* a); -+Value *VMOVMSKPS(Value* a); -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp -new file mode 100644 -index 0000000..1b87769 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp -@@ -0,0 +1,1450 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file fetch_jit.cpp -+* -+* @brief Implementation of the fetch jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#include "jit_api.h" -+#include "fetch_jit.h" -+#include "builder.h" -+#include "state_llvm.h" -+#include "common/containers.hpp" -+#include "llvm/IR/DataLayout.h" -+#include -+#include -+ -+//#define FETCH_DUMP_VERTEX 1 -+ -+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); -+ -+enum ConversionType -+{ -+ CONVERT_NONE, -+ CONVERT_NORMALIZED, -+ CONVERT_USCALED, -+ CONVERT_SSCALED, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// Interface to Jitting a fetch shader -+////////////////////////////////////////////////////////////////////////// -+struct FetchJit : public Builder -+{ -+ FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; -+ -+ Function* Create(const FETCH_COMPILE_STATE& fetchState); -+ Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); -+ Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); -+ Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); -+ -+ // package up Shuffle*bpcGatherd args into a tuple for convenience -+ typedef std::tuple Shuffle8bpcArgs; -+ void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); -+ -+ typedef std::tuple Shuffle16bpcArgs; -+ void Shuffle16bpcGather(Shuffle16bpcArgs &args); -+ -+ void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); -+ -+ Value* GenerateCompCtrlVector(const ComponentControl ctrl); -+ -+ void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); -+ void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); -+}; -+ -+Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) -+{ -+ static std::size_t fetchNum = 0; -+ -+ std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << fetchNum++; -+ -+ Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); -+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); -+ -+ IRB()->SetInsertPoint(entry); -+ -+ auto argitr = fetch->getArgumentList().begin(); -+ -+ // Fetch shader arguments -+ Value* fetchInfo = argitr; ++argitr; -+ fetchInfo->setName("fetchInfo"); -+ Value* pVtxOut = argitr; -+ pVtxOut->setName("vtxOutput"); -+ // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex -+ // index 0(just the pointer to the simdvertex structure -+ // index 1(which element of the simdvertex structure to offset to(in this case 0) -+ // so the indices being i32's doesn't matter -+ // TODO: generated this GEP with a VECTOR structure type so this makes sense -+ std::vector vtxInputIndices(2, C(0)); -+ // GEP -+ pVtxOut = GEP(pVtxOut, C(0)); -+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); -+ -+ // SWR_FETCH_CONTEXT::pStreams -+ Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); -+ streams->setName("pStreams"); -+ -+ // SWR_FETCH_CONTEXT::pIndices -+ Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices}); -+ indices->setName("pIndices"); -+ -+ // SWR_FETCH_CONTEXT::pLastIndex -+ Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex}); -+ 
pLastIndex->setName("pLastIndex"); -+ -+ -+ Value* vIndices; -+ switch(fetchState.indexType) -+ { -+ case R8_UINT: -+ indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); -+ if(fetchState.bDisableIndexOOBCheck){ -+ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); -+ vIndices = Z_EXT(vIndices, mSimdInt32Ty); -+ } -+ else{ -+ pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); -+ vIndices = GetSimdValid8bitIndices(indices, pLastIndex); -+ } -+ break; -+ case R16_UINT: -+ indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0)); -+ if(fetchState.bDisableIndexOOBCheck){ -+ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); -+ vIndices = Z_EXT(vIndices, mSimdInt32Ty); -+ } -+ else{ -+ pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); -+ vIndices = GetSimdValid16bitIndices(indices, pLastIndex); -+ } -+ break; -+ case R32_UINT: -+ (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0}) -+ : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); -+ break; // incoming type is already 32bit int -+ default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break; -+ } -+ -+ // store out vertex IDs -+ STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); -+ -+ // store out cut mask if enabled -+ if (fetchState.bEnableCutIndex) -+ { -+ Value* vCutIndex = VIMMED1(fetchState.cutIndex); -+ Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); -+ STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); -+ } -+ -+ // Fetch attributes from memory and output to a simdvertex struct -+ // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use -+ (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut) -+ : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut); -+ -+ RET_VOID(); -+ -+ //#define KNOB_SWRC_TRACING -+ -+#if defined(KNOB_SWRC_TRACING) -+ std::string err; -+ char fName[1024]; -+ const char *funcName = fetch->getName().data(); -+ sprintf(fName, "%s.ll", funcName); -+ raw_fd_ostream fetchFD(fName, err, LLVM_F_NONE); -+ fetch->print(fetchFD); -+ fetchFD.flush(); -+#endif -+ verifyFunction(*fetch); -+ -+ FunctionPassManager setupPasses(JM()->mpCurrentModule); -+ -+ ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) -+ setupPasses.add(createBreakCriticalEdgesPass()); -+ setupPasses.add(createCFGSimplificationPass()); -+ setupPasses.add(createEarlyCSEPass()); -+ setupPasses.add(createPromoteMemoryToRegisterPass()); -+ -+ setupPasses.run(*fetch); -+ -+#if defined(KNOB_SWRC_TRACING) -+ sprintf(fName, "%s.se.ll", funcName); -+ raw_fd_ostream seFetchFD(fName, err, LLVM_F_NONE); -+ fetch->print(seFetchFD); -+ seFetchFD.flush(); -+#endif -+ -+ FunctionPassManager optPasses(JM()->mpCurrentModule); -+ -+ ///@todo Haven't touched these either. Need to remove some of these and add others. 
-+ optPasses.add(createCFGSimplificationPass()); -+ optPasses.add(createEarlyCSEPass()); -+ optPasses.add(createInstructionCombiningPass()); -+ optPasses.add(createInstructionSimplifierPass()); -+ optPasses.add(createConstantPropagationPass()); -+ optPasses.add(createSCCPPass()); -+ optPasses.add(createAggressiveDCEPass()); -+ -+ optPasses.run(*fetch); -+ optPasses.run(*fetch); -+ -+#if defined(KNOB_SWRC_TRACING) -+ sprintf(fName, "%s.opt.ll", funcName); -+ raw_fd_ostream optFetchFD(fName, err, LLVM_F_NONE); -+ fetch->print(optFetchFD); -+ optFetchFD.flush(); -+#endif -+ -+ return fetch; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads attributes from memory using LOADs, shuffling the -+/// components into SOA form. -+/// *Note* currently does not support component control, -+/// component packing, or instancing -+/// @param fetchState - info about attributes to be fetched from memory -+/// @param streams - value pointer to the current vertex stream -+/// @param vIndices - vector value of indices to load -+/// @param pVtxOut - value pointer to output simdvertex struct -+void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut) -+{ -+ // Zack shuffles; a variant of the Charleston. -+ -+ SWRL::UncheckedFixedVector vectors; -+ -+ std::vector pMask(JM()->mVWidth); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ pMask[i] = (C(i < 4 ? i : 4)); -+ } -+ Constant* promoteMask = ConstantVector::get(pMask); -+ Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4)); -+ -+ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); -+ -+ for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt) -+ { -+ Value* elements[4] = {0}; -+ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt]; -+ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); -+ uint32_t numComponents = info.numComps; -+ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. -+ -+ vectors.clear(); -+ -+ // load SWR_VERTEX_BUFFER_STATE::pData -+ Value *stream = LOAD(streams, {ied.StreamIndex, 2}); -+ -+ // load SWR_VERTEX_BUFFER_STATE::pitch -+ Value *stride = LOAD(streams, {ied.StreamIndex, 1}); -+ stride = Z_EXT(stride, mInt64Ty); -+ -+ // load SWR_VERTEX_BUFFER_STATE::size -+ Value *size = LOAD(streams, {ied.StreamIndex, 3}); -+ size = Z_EXT(size, mInt64Ty); -+ -+ Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); -+ -+ // Load from the stream. -+ for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) -+ { -+ // Get index -+ Value* index = VEXTRACT(vIndices, C(lane)); -+ index = Z_EXT(index, mInt64Ty); -+ -+ Value* offset = MUL(index, stride); -+ offset = ADD(offset, C((int64_t)ied.AlignedByteOffset)); -+ offset = ADD(offset, startVertexOffset); -+ -+ if (!fetchState.bDisableIndexOOBCheck) { -+ // check for out of bound access, including partial OOB, and mask them to 0 -+ Value *endOffset = ADD(offset, C((int64_t)info.Bpp)); -+ Value *oob = ICMP_ULE(endOffset, size); -+ offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0)); -+ } -+ -+ Value* pointer = GEP(stream, offset); -+ // We use a full-lane, but don't actually care. 
-+ Value* vptr = 0; -+ -+ // get a pointer to a 4 component attrib in default address space -+ switch(bpc) -+ { -+ case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break; -+ case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break; -+ case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break; -+ default: SWR_ASSERT(false, "Unsupported underlying bpp!"); -+ } -+ -+ // load 4 components of attribute -+ Value* vec = ALIGNED_LOAD(vptr, 1, false); -+ -+ // Convert To FP32 internally -+ switch(info.type[0]) -+ { -+ case SWR_TYPE_UNORM: -+ switch(bpc) -+ { -+ case 8: -+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0)))); -+ break; -+ case 16: -+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0)))); -+ break; -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_SNORM: -+ switch(bpc) -+ { -+ case 8: -+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0)))); -+ break; -+ case 16: -+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ vec = FMUL(vec, ConstantVector::get(std::vector(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0)))); -+ break; -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_UINT: -+ // Zero extend uint32_t types. -+ switch(bpc) -+ { -+ case 8: -+ case 16: -+ vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4)); -+ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case 32: -+ break; // Pass through unchanged. -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_SINT: -+ // Sign extend SINT types. -+ switch(bpc) -+ { -+ case 8: -+ case 16: -+ vec = S_EXT(vec, VectorType::get(mInt32Ty, 4)); -+ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case 32: -+ break; // Pass through unchanged. -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ break; -+ } -+ break; -+ case SWR_TYPE_FLOAT: -+ switch(bpc) -+ { -+ case 32: -+ break; // Pass through unchanged. -+ default: -+ SWR_ASSERT(false, "Unsupported underlying type!"); -+ } -+ break; -+ case SWR_TYPE_USCALED: -+ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case SWR_TYPE_SSCALED: -+ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); -+ break; -+ case SWR_TYPE_UNKNOWN: -+ case SWR_TYPE_UNUSED: -+ SWR_ASSERT(false, "Unsupported type %d!", info.type[0]); -+ } -+ -+ // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4) -+ // uwvec: 4 x F32, undef value -+ Value* wvec = VSHUFFLE(vec, uwvec, promoteMask); -+ vectors.push_back(wvec); -+ } -+ -+ std::vector v01Mask(JM()->mVWidth); -+ std::vector v23Mask(JM()->mVWidth); -+ std::vector v02Mask(JM()->mVWidth); -+ std::vector v13Mask(JM()->mVWidth); -+ -+ // Concatenate the vectors together. 
-+ elements[0] = VUNDEF_F(); -+ elements[1] = VUNDEF_F(); -+ elements[2] = VUNDEF_F(); -+ elements[3] = VUNDEF_F(); -+ for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) -+ { -+ v01Mask[4 * b + 0] = C(0 + 4 * b); -+ v01Mask[4 * b + 1] = C(1 + 4 * b); -+ v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); -+ v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); -+ -+ v23Mask[4 * b + 0] = C(2 + 4 * b); -+ v23Mask[4 * b + 1] = C(3 + 4 * b); -+ v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); -+ v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); -+ -+ v02Mask[4 * b + 0] = C(0 + 4 * b); -+ v02Mask[4 * b + 1] = C(2 + 4 * b); -+ v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); -+ v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); -+ -+ v13Mask[4 * b + 0] = C(1 + 4 * b); -+ v13Mask[4 * b + 1] = C(3 + 4 * b); -+ v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); -+ v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); -+ -+ std::vector iMask(JM()->mVWidth); -+ for(uint32_t i = 0; i < JM()->mVWidth; ++i) -+ { -+ if(((4 * b) <= i) && (i < (4 * (b + 1)))) -+ { -+ iMask[i] = C(i % 4 + JM()->mVWidth); -+ } -+ else -+ { -+ iMask[i] = C(i); -+ } -+ } -+ Constant* insertMask = ConstantVector::get(iMask); -+ elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask); -+ elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask); -+ elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask); -+ elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask); -+ } -+ -+ Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask)); -+ Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask)); -+ Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask)); -+ Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask)); -+ elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask)); -+ elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask)); -+ elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask)); -+ elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask)); -+ -+ switch(numComponents + 1) -+ { -+ case 1: elements[0] = VIMMED1(0.0f); -+ case 2: elements[1] = VIMMED1(0.0f); -+ case 3: elements[2] = VIMMED1(0.0f); -+ case 4: elements[3] = VIMMED1(1.0f); -+ } -+ -+ for(uint32_t c = 0; c < 4; ++c) -+ { -+ Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); -+ STORE(elements[c], dest); -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads attributes from memory using AVX2 GATHER(s) -+/// @param fetchState - info about attributes to be fetched from memory -+/// @param fetchInfo - first argument passed to fetch shader -+/// @param streams - value pointer to the current vertex stream -+/// @param vIndices - vector value of indices to gather -+/// @param pVtxOut - value pointer to output simdvertex struct -+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, -+ Value* streams, Value* vIndices, Value* pVtxOut) -+{ -+ uint32_t currentVertexElement = 0; -+ uint32_t outputElt = 0; -+ Value* vVertexElements[4]; -+ -+ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); -+ Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); -+ Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); -+ Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, 
SWR_FETCH_CONTEXT_BaseVertex})); -+ curInstance->setName("curInstance"); -+ -+ for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt) -+ { -+ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; -+ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); -+ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. -+ -+ Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); -+ -+ // VGATHER* takes an *i8 src pointer -+ Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0)); -+ -+ Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); -+ Value *vStride = VBROADCAST(stride); -+ -+ // max vertex index that is fully in bounds -+ Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); -+ maxVertex = LOAD(maxVertex); -+ -+ Value *vCurIndices; -+ Value *startOffset; -+ if(ied.InstanceEnable) -+ { -+ Value* stepRate = C(ied.InstanceDataStepRate); -+ -+ // prevent a div by 0 for 0 step rate -+ Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); -+ stepRate = SELECT(isNonZeroStep, stepRate, C(1)); -+ -+ // calc the current offset into instanced data buffer -+ Value* calcInstance = UDIV(curInstance, stepRate); -+ -+ // if step rate is 0, every instance gets instance 0 -+ calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); -+ -+ vCurIndices = VBROADCAST(calcInstance); -+ -+ startOffset = startInstance; -+ } -+ else -+ { -+ // offset indices by baseVertex -+ vCurIndices = ADD(vIndices, vBaseVertex); -+ -+ startOffset = startVertex; -+ } -+ -+ // All of the OOB calculations are in vertices, not VB offsets, to prevent having to -+ // do 64bit address offset calculations. -+ -+ // calculate byte offset to the start of the VB -+ Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); -+ pStreamBase = GEP(pStreamBase, baseOffset); -+ -+ // if we have a start offset, subtract from max vertex. Used for OOB check -+ maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); -+ Value* neg = ICMP_SLT(maxVertex, C((int64_t)0)); -+ // if we have a negative value, we're already OOB. clamp at 0. -+ maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty)); -+ -+ // Load the in bounds size of a partially valid vertex -+ Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); -+ partialInboundsSize = LOAD(partialInboundsSize); -+ Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); -+ Value* vBpp = VBROADCAST(C(info.Bpp)); -+ -+ // is the element is <= the partially valid size -+ Value* vElementInBoundsMask = ICMP_ULE(vBpp, vPartialVertexSize); -+ -+ // are vertices partially OOB? -+ Value* vMaxVertex = VBROADCAST(maxVertex); -+ Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); -+ -+ // are vertices are fully in bounds? 
-+ Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); -+ -+ // blend in any partially OOB indices that have valid elements -+ vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); -+ vGatherMask = VMASK(vGatherMask); -+ -+ // calculate the actual offsets into the VB -+ Value* vOffsets = MUL(vCurIndices, vStride); -+ Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); -+ vOffsets = ADD(vOffsets, vAlignmentOffsets); -+ -+ // Packing and component control -+ ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; -+ const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, -+ (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3}; -+ -+ if(info.type[0] == SWR_TYPE_FLOAT) -+ { -+ ///@todo: support 64 bit vb accesses -+ Value* gatherSrc = VIMMED1(0.0f); -+ -+ // Gather components from memory to store in a simdvertex structure -+ switch(bpc) -+ { -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // if we have at least one component out of x or y to fetch -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ // save mask as it is zero'd out after each gather -+ vMask = vGatherMask; -+ -+ vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ } -+ -+ // if we have at least one component out of z or w to fetch -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ // offset base to the next components(zw) in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ vMask = vGatherMask; -+ -+ vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. 
result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ -+ // if we have at least one component to shuffle into place -+ if(compMask){ -+ Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, -+ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); -+ // Shuffle gathered components into place in simdvertex struct -+ Shuffle16bpcGather(args); // outputs to vVertexElements ref -+ } -+ } -+ break; -+ case 32: -+ { -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ if(!isComponentEnabled(compMask, i)){ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ continue; -+ } -+ -+ // if we need to gather the component -+ if(compCtrl[i] == StoreSrc){ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = vGatherMask; -+ -+ // Gather a SIMD of vertices -+ vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ } -+ } -+ break; -+ default: -+ SWR_ASSERT(0, "Tried to fetch invalid FP format"); -+ break; -+ } -+ } -+ else -+ { -+ Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; -+ ConversionType conversionType = CONVERT_NONE; -+ -+ switch(info.type[0]) -+ { -+ case SWR_TYPE_UNORM: -+ conversionType = CONVERT_NORMALIZED; -+ case SWR_TYPE_UINT: -+ extendCastType = Instruction::CastOps::ZExt; -+ break; -+ case SWR_TYPE_SNORM: -+ conversionType = CONVERT_NORMALIZED; -+ case SWR_TYPE_SINT: -+ extendCastType = Instruction::CastOps::SExt; -+ break; -+ case SWR_TYPE_USCALED: -+ conversionType = CONVERT_USCALED; -+ extendCastType = Instruction::CastOps::UIToFP; -+ break; -+ case SWR_TYPE_SSCALED: -+ conversionType = CONVERT_SSCALED; -+ extendCastType = Instruction::CastOps::SIToFP; -+ break; -+ default: -+ break; -+ } -+ -+ // value substituted when component of gather is masked -+ Value* gatherSrc = VIMMED1(0); -+ -+ // Gather components from memory to store in a simdvertex structure -+ switch (bpc) -+ { -+ case 8: -+ { -+ // if we have at least one component to fetch -+ if(compMask){ -+ Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); -+ // e.g. 
result of an 8x32bit integer gather for 8bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw -+ -+ Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, -+ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); -+ // Shuffle gathered components into place in simdvertex struct -+ Shuffle8bpcGatherd(args); // outputs to vVertexElements ref -+ } -+ } -+ break; -+ case 16: -+ { -+ Value* vGatherResult[2]; -+ Value *vMask; -+ -+ // if we have at least one component out of x or y to fetch -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ // save mask as it is zero'd out after each gather -+ vMask = vGatherMask; -+ -+ vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. result of first 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy -+ // -+ } -+ -+ // if we have at least one component out of z or w to fetch -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ // offset base to the next components(zw) in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ vMask = vGatherMask; -+ -+ vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ // e.g. result of second 8x32bit integer gather for 16bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw -+ // -+ } -+ -+ // if we have at least one component to shuffle into place -+ if(compMask){ -+ Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, -+ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); -+ // Shuffle gathered components into place in simdvertex struct -+ Shuffle16bpcGather(args); // outputs to vVertexElements ref -+ } -+ } -+ break; -+ case 32: -+ { -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ -+ // Gathered components into place in simdvertex struct -+ for(uint32_t i = 0; i < 4; i++) -+ { -+ if(!isComponentEnabled(compMask, i)){ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ continue; -+ } -+ -+ // if we need to gather the component -+ if(compCtrl[i] == StoreSrc){ -+ // save mask as it is zero'd out after each gather -+ Value *vMask = vGatherMask; -+ -+ vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); -+ -+ // e.g. result of a single 8x32bit integer gather for 32bit components -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ -+ // offset base to the next component in the vertex to gather -+ pStreamBase = GEP(pStreamBase, C((char)4)); -+ } -+ } -+ break; -+ } -+ } -+ } -+ -+ // if we have a partially filled vVertexElement struct, output it -+ if(currentVertexElement > 0){ -+ StoreVertexElements(pVtxOut, outputElt++, currentVertexElement+1, vVertexElements); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a simd of valid indices. 
OOB indices are set to 0 -+/// *Note* have to do 16bit index checking in scalar until we have AVX-512 -+/// support -+/// @param pIndices - pointer to 8 bit indices -+/// @param pLastIndex - pointer to last valid index -+Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) -+{ -+ // can fit 2 16 bit integers per vWidth lane -+ Value* vIndices = VUNDEF_I(); -+ -+ // store 0 index on stack to be used to conditionally load from if index address is OOB -+ Value* pZeroIndex = ALLOCA(mInt8Ty); -+ STORE(C((uint8_t)0), pZeroIndex); -+ -+ // Load a SIMD of index pointers -+ for(int64_t lane = 0; lane < JM()->mVWidth; lane++) -+ { -+ // Calculate the address of the requested index -+ Value *pIndex = GEP(pIndices, C(lane)); -+ -+ // check if the address is less than the max index, -+ Value* mask = ICMP_ULT(pIndex, pLastIndex); -+ -+ // if valid, load the index. if not, load 0 from the stack -+ Value* pValid = SELECT(mask, pIndex, pZeroIndex); -+ Value *index = LOAD(pValid, "valid index"); -+ -+ // zero extended index to 32 bits and insert into the correct simd lane -+ index = Z_EXT(index, mInt32Ty); -+ vIndices = VINSERT(vIndices, index, lane); -+ } -+ return vIndices; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a simd of valid indices. OOB indices are set to 0 -+/// *Note* have to do 16bit index checking in scalar until we have AVX-512 -+/// support -+/// @param pIndices - pointer to 16 bit indices -+/// @param pLastIndex - pointer to last valid index -+Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) -+{ -+ // can fit 2 16 bit integers per vWidth lane -+ Value* vIndices = VUNDEF_I(); -+ -+ // store 0 index on stack to be used to conditionally load from if index address is OOB -+ Value* pZeroIndex = ALLOCA(mInt16Ty); -+ STORE(C((uint16_t)0), pZeroIndex); -+ -+ // Load a SIMD of index pointers -+ for(int64_t lane = 0; lane < JM()->mVWidth; lane++) -+ { -+ // Calculate the address of the requested index -+ Value *pIndex = GEP(pIndices, C(lane)); -+ -+ // check if the address is less than the max index, -+ Value* mask = ICMP_ULT(pIndex, pLastIndex); -+ -+ // if valid, load the index. if not, load 0 from the stack -+ Value* pValid = SELECT(mask, pIndex, pZeroIndex); -+ Value *index = LOAD(pValid, "valid index"); -+ -+ // zero extended index to 32 bits and insert into the correct simd lane -+ index = Z_EXT(index, mInt32Ty); -+ vIndices = VINSERT(vIndices, index, lane); -+ } -+ return vIndices; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a simd of valid indices. 
OOB indices are set to 0 -+/// @param pIndices - pointer to 32 bit indices -+/// @param pLastIndex - pointer to last valid index -+Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) -+{ -+ DataLayout dL(JM()->mpCurrentModule); -+ unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits -+ Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize)); -+ Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize)); -+ -+ // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index) -+ Value* numIndicesLeft = SUB(iLastIndex,iIndices); -+ numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); -+ numIndicesLeft = SDIV(numIndicesLeft, C(4)); -+ -+ // create a vector of index counts from the base index ptr passed into the fetch -+ const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)}; -+ Constant* vIndexOffsets = ConstantVector::get(vecIndices); -+ -+ // compare index count to the max valid index -+ // e.g. vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load -+ // vIndexOffsets 0 1 2 3 4 5 6 7 -+ // ------------------------------ -+ // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass -+ // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 -+ Value* vMaxIndex = VBROADCAST(numIndicesLeft); -+ Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets); -+ -+ // VMASKLOAD takes an *i8 src pointer -+ pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0)); -+ -+ // Load the indices; OOB loads 0 -+ return MASKLOADD(pIndices,vIndexMask); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, -+/// denormalizes if needed, converts to F32 if needed, and positions in -+/// the proper SIMD rows to be output to the simdvertex structure -+/// @param args: (tuple of args, listed below) -+/// @param vGatherResult - 8 gathered 8bpc vertices -+/// @param pVtxOut - base pointer to output simdvertex struct -+/// @param extendType - sign extend or zero extend -+/// @param conversionType - conversion to apply (normalized, scaled, or none) 
-+/// @param currentVertexElement - reference to the current vVertexElement -+/// @param outputElt - reference to the current offset from simdvertex we're outputting to -+/// @param compMask - component packing mask -+/// @param compCtrl - component control val -+/// @param vVertexElements[4] - vertex components to output -+/// @param swizzle[4] - component swizzle location -+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) -+{ -+ // Unpack tuple args -+ Value*& vGatherResult = std::get<0>(args); -+ Value* pVtxOut = std::get<1>(args); -+ const Instruction::CastOps extendType = std::get<2>(args); -+ const ConversionType conversionType = std::get<3>(args); -+ uint32_t &currentVertexElement = std::get<4>(args); -+ uint32_t &outputElt = std::get<5>(args); -+ const ComponentEnable compMask = std::get<6>(args); -+ const ComponentControl (&compCtrl)[4] = std::get<7>(args); -+ Value* (&vVertexElements)[4] = std::get<8>(args); -+ const uint32_t (&swizzle)[4] = std::get<9>(args); -+ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits -+ -+ // have to do extra work for sign extending -+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ -+ Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane -+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ -+ // shuffle mask, including any swizzling -+ const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; -+ const char z = (char)swizzle[2]; const char w = (char)swizzle[3]; -+ Value* vConstMask = C({char(x), char(x+4), char(x+8), char(x+12), -+ char(y), char(y+4), char(y+8), char(y+12), -+ char(z), char(z+4), char(z+8), char(z+12), -+ char(w), char(w+4), char(w+8), char(w+12), -+ char(x), char(x+4), char(x+8), char(x+12), -+ char(y), char(y+4), char(y+8), char(y+12), -+ char(z), char(z+4), char(z+8), char(z+12), -+ char(w), char(w+4), char(w+8), char(w+12)}); -+ -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww -+ -+ Value* vi128XY = nullptr; -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); -+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) -+ } -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); -+ } -+ -+ // init denormalize variables if needed -+ Instruction::CastOps fpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ fpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 127.0)); -+ break; -+ case CONVERT_SSCALED: -+ fpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0)); -+ break; -+ case CONVERT_USCALED: -+ SWR_ASSERT(0, "Type should not be sign extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); 
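-+ // CONVERT_NONE: gathered values are used as-is, so no scale factor is needed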
-+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; -+ -+ // sign extend -+ vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty)); -+ -+ // denormalize if needed -+ if(conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ } -+ // else zero extend -+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) -+ { -+ // init denormalize variables if needed -+ Instruction::CastOps fpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 255.0)); -+ break; -+ case CONVERT_USCALED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0)); -+ break; -+ case CONVERT_SSCALED: -+ SWR_ASSERT(0, "Type should not be zero extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // pshufb masks for each component -+ Value* vConstMask; -+ switch(swizzle[i]){ -+ case 0: -+ // x shuffle mask -+ vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, -+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); -+ break; -+ case 1: -+ // y shuffle mask -+ vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, -+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); -+ break; -+ case 2: -+ // z shuffle mask -+ vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, -+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); -+ break; -+ case 3: -+ // w shuffle mask -+ vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, -+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); -+ break; -+ default: -+ vConstMask = nullptr; -+ break; -+ } -+ -+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb for x channel -+ // 256i - 0 1 2 3 4 5 6 7 -+ // x000 x000 x000 x000 x000 x000 x000 x000 -+ -+ // denormalize if needed -+ if (conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ 
currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ } -+ else -+ { -+ SWR_ASSERT(0, "Unsupported conversion type"); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, -+/// denormalizes if needed, converts to F32 if needed, and positions in -+/// the proper SIMD rows to be output to the simdvertex structure -+/// @param args: (tuple of args, listed below) -+/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index -+/// @param pVtxOut - base pointer to output simdvertex struct -+/// @param extendType - sign extend or zero extend -+/// @param conversionType - conversion to apply (normalized, scaled, or none) -+/// @param currentVertexElement - reference to the current vVertexElement -+/// @param outputElt - reference to the current offset from simdvertex we're outputting to -+/// @param compMask - component packing mask -+/// @param compCtrl - component control val -+/// @param vVertexElements[4] - vertex components to output -+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) -+{ -+ // Unpack tuple args -+ Value* (&vGatherResult)[2] = std::get<0>(args); -+ Value* pVtxOut = std::get<1>(args); -+ const Instruction::CastOps extendType = std::get<2>(args); -+ const ConversionType conversionType = std::get<3>(args); -+ uint32_t &currentVertexElement = std::get<4>(args); -+ uint32_t &outputElt = std::get<5>(args); -+ const ComponentEnable compMask = std::get<6>(args); -+ const ComponentControl(&compCtrl)[4] = std::get<7>(args); -+ Value* (&vVertexElements)[4] = std::get<8>(args); -+ -+ // cast types -+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); -+ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits -+ -+ // have to do extra work for sign extending -+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| -+ (extendType == Instruction::CastOps::FPExt)) -+ { -+ // is this PP float? -+ bool bFP = (extendType == Instruction::CastOps::FPExt) ? 
true : false; -+ -+ Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane -+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits -+ -+ // shuffle mask -+ Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, -+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); -+ Value* vi128XY = nullptr; -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); -+ // after pshufb: group components together in each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy -+ -+ vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ // after PERMD: move and pack xy components into each 128bit lane -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy -+ } -+ -+ // do the same for zw components -+ Value* vi128ZW = nullptr; -+ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ -+ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); -+ vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); -+ } -+ -+ // init denormalize variables if needed -+ Instruction::CastOps IntToFpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ IntToFpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 32767.0)); -+ break; -+ case CONVERT_SSCALED: -+ IntToFpCast = Instruction::CastOps::SIToFP; -+ conversionFactor = VIMMED1((float)(1.0)); -+ break; -+ case CONVERT_USCALED: -+ SWR_ASSERT(0, "Type should not be sign extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 -+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; -+ -+ if(bFP) { -+ // extract 128 bit lanes to sign extend each component -+ vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); -+ } -+ else { -+ // extract 128 bit lanes to sign extend each component -+ vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); -+ -+ // denormalize if needed -+ if(conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ } -+ currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ -+ } -+ // else zero extend -+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) -+ { -+ // pshufb masks for each component -+ Value* vConstMask[2]; -+ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){ -+ // x/z shuffle mask -+ vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, -+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); -+ } -+ -+ if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){ -+ // y/w shuffle mask -+ vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, -+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); -+ } -+ -+ // init denormalize variables if needed -+ Instruction::CastOps fpCast; -+ Value* conversionFactor; -+ -+ switch (conversionType) -+ { -+ case CONVERT_NORMALIZED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0 / 65535.0)); -+ break; -+ case CONVERT_USCALED: -+ fpCast = Instruction::CastOps::UIToFP; -+ conversionFactor = VIMMED1((float)(1.0f)); -+ break; -+ case CONVERT_SSCALED: -+ SWR_ASSERT(0, "Type should not be zero extended!"); -+ conversionFactor = nullptr; -+ break; -+ default: -+ SWR_ASSERT(conversionType == CONVERT_NONE); -+ conversionFactor = nullptr; -+ break; -+ } -+ -+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits -+ for(uint32_t i = 0; i < 4; i++){ -+ if(!isComponentEnabled(compMask, i)){ -+ continue; -+ } -+ -+ if(compCtrl[i] == ComponentControl::StoreSrc){ -+ // select correct constMask for x/z or y/w pshufb -+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; -+ // if x or y, use vi128XY permute result, else use vi128ZW -+ uint32_t selectedGather = (i < 2) ? 
0 : 1; -+ -+ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); -+ // after pshufb mask for x channel; z uses the same shuffle from the second gather -+ // 256i - 0 1 2 3 4 5 6 7 -+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 -+ -+ // denormalize if needed -+ if(conversionType != CONVERT_NONE){ -+ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); -+ } -+ currentVertexElement++; -+ } -+ else{ -+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -+ } -+ -+ if(currentVertexElement > 3){ -+ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); -+ // reset to the next vVertexElement to output -+ currentVertexElement = 0; -+ } -+ } -+ } -+ else -+ { -+ SWR_ASSERT(0, "Unsupported conversion type"); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Output a simdvertex worth of elements to the current outputElt -+/// @param pVtxOut - base address of VIN output struct -+/// @param outputElt - simdvertex offset in VIN to write to -+/// @param numEltsToStore - number of simdvertex rows to write out -+/// @param vVertexElements - LLVM Value*[] simdvertex to write out -+void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) -+{ -+ for(uint32_t c = 0; c < numEltsToStore; ++c) -+ { -+ // STORE expects FP32 x vWidth type, just bitcast if needed -+ if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){ -+#if FETCH_DUMP_VERTEX -+ PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); -+#endif -+ vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); -+ } -+#if FETCH_DUMP_VERTEX -+ else -+ { -+ PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); -+ } -+#endif -+ // outputElt * 4 = offsetting by the size of a simdvertex -+ // + c offsets to a 32bit x vWidth row within the current vertex -+ Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); -+ STORE(vVertexElements[c], dest); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Generates a constant vector of values based on the -+/// ComponentControl value -+/// @param ctrl - ComponentControl value -+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) -+{ -+ switch(ctrl) -+ { -+ case NoStore: return VUNDEF_I(); -+ case Store0: return VIMMED1(0); -+ case Store1Fp: return VIMMED1(1.0f); -+ case Store1Int: return VIMMED1(1); -+ case StoreSrc: -+ default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I(); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Returns the enable mask for the specified component. -+/// @param enableMask - enable bits -+/// @param component - component to check if enabled. 
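-+/// @return true if the given component's bit is set in enableMask, false otherwise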
-+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) -+{ -+ switch (component) -+ { -+ // X -+ case 0: return (enableMask & ComponentEnable::X); -+ // Y -+ case 1: return (enableMask & ComponentEnable::Y); -+ // Z -+ case 2: return (enableMask & ComponentEnable::Z); -+ // W -+ case 3: return (enableMask & ComponentEnable::W); -+ -+ default: return false; -+ } -+} -+ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JITs from fetch shader IR -+/// @param hJitMgr - JitManager handle -+/// @param func - LLVM function IR -+/// @return PFN_FETCH_FUNC - pointer to fetch code -+PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) -+{ -+ const llvm::Function* func = (const llvm::Function*)hFunc; -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ PFN_FETCH_FUNC pfnFetch; -+ -+ pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); -+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module -+ pJitMgr->mIsModuleFinalized = true; -+ -+#if defined(KNOB_SWRC_TRACING) -+ char fName[1024]; -+ const char *funcName = func->getName().data(); -+ sprintf(fName, "%s.bin", funcName); -+ FILE *fd = fopen(fName, "wb"); -+ fwrite((void *)pfnFetch, 1, 2048, fd); -+ fclose(fd); -+#endif -+ -+ return pfnFetch; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles fetch shader -+/// @param hJitMgr - JitManager handle -+/// @param state - fetch state to build function from -+extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) -+{ -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ -+ pJitMgr->SetupNewModule(); -+ -+ FetchJit theJit(pJitMgr); -+ HANDLE hFunc = theJit.Create(state); -+ -+ return JitFetchFunc(hJitMgr, hFunc); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h -new file mode 100644 -index 0000000..ea3625d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h -@@ -0,0 +1,128 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. 
-+* -+* @file fetch_jit.h -+* -+* @brief Definition of the fetch jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "core/state.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// INPUT_ELEMENT_DESC -+////////////////////////////////////////////////////////////////////////// -+struct INPUT_ELEMENT_DESC -+{ -+ union -+ { -+ struct -+ { -+ uint32_t AlignedByteOffset : 12; -+ uint32_t Format : 10; -+ uint32_t StreamIndex : 6; -+ uint32_t InstanceEnable : 1; -+ uint32_t ComponentControl0 : 3; -+ uint32_t ComponentControl1 : 3; -+ uint32_t ComponentControl2 : 3; -+ uint32_t ComponentControl3 : 3; -+ uint32_t ComponentPacking : 4; -+ uint32_t _reserved : 19; -+ }; -+ uint64_t bits; -+ }; -+ uint32_t InstanceDataStepRate; -+}; -+ -+// used to set ComponentPacking -+enum ComponentEnable -+{ -+ NONE = 0x0, -+ X = 0x1, -+ Y = 0x2, -+ XY = 0x3, -+ Z = 0x4, -+ XZ = 0x5, -+ YZ = 0x6, -+ XYZ = 0x7, -+ W = 0x8, -+ XW = 0x9, -+ YW = 0xA, -+ XYW = 0xB, -+ ZW = 0xC, -+ XZW = 0xD, -+ YZW = 0xE, -+ XYZW = 0xF, -+}; -+ -+enum ComponentControl -+{ -+ NoStore = 0, -+ StoreSrc = 1, -+ Store0 = 2, -+ Store1Fp = 3, -+ Store1Int = 4, -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// State required for fetch shader jit compile. -+////////////////////////////////////////////////////////////////////////// -+struct FETCH_COMPILE_STATE -+{ -+ uint32_t numAttribs; -+ INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES]; -+ SWR_FORMAT indexType; -+ uint32_t cutIndex{ 0xffffffff }; -+ -+ // Options that effect the JIT'd code -+ bool bDisableVGATHER; // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs -+ bool bDisableIndexOOBCheck; // if enabled, FetchJit will exclude index OOB check -+ bool bEnableCutIndex{ false }; // compares indices with the cut index and returns a cut mask -+ -+ FETCH_COMPILE_STATE(bool useVGATHER = false, bool indexOOBCheck = false) : -+ bDisableVGATHER(useVGATHER), bDisableIndexOOBCheck(indexOOBCheck){}; -+ -+ bool operator==(const FETCH_COMPILE_STATE &other) const -+ { -+ if (numAttribs != other.numAttribs) return false; -+ if (indexType != other.indexType) return false; -+ if (bDisableVGATHER != other.bDisableVGATHER) return false; -+ if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false; -+ if (bEnableCutIndex != other.bEnableCutIndex) return false; -+ if (cutIndex != other.cutIndex) return false; -+ -+ for(uint32_t i = 0; i < numAttribs; ++i) -+ { -+ if((layout[i].bits != other.layout[i].bits) || -+ ((layout[i].InstanceEnable == 1) && -+ (layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){ -+ return false; -+ } -+ } -+ -+ return true; -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h -new file mode 100644 -index 0000000..afa33bb ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h -@@ -0,0 +1,105 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file jit_api.h -+* -+* @brief Platform independent JIT interface -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+#include "common/os.h" -+ -+#include "fetch_jit.h" -+#include "streamout_jit.h" -+#include "blend_jit.h" -+ -+#if defined(_WIN32) -+#define JITCALL __stdcall -+#else -+#define JITCALL -+#endif -+ -+extern "C" -+{ -+ -+struct ShaderInfo; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Create JIT context. -+HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Destroy JIT context. -+void JITCALL JitDestroyContext(HANDLE hJitContext); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Jit Compile Info Input -+////////////////////////////////////////////////////////////////////////// -+struct JIT_COMPILE_INPUT -+{ -+ SWR_SHADER_TYPE type; -+ -+ const void* pIR; ///< Pointer to LLVM IR text. -+ -+ bool enableJitSampler; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compile shader. -+/// @param hJitContext - Jit Context -+/// @param input - Input containing LLVM IR and other information -+/// @param output - Output containing information about JIT shader -+/// @return HANDLE - pointer to shader object. -+HANDLE JITCALL JitCompileShader( -+ HANDLE hJitContext, -+ const JIT_COMPILE_INPUT& input, -+ ShaderInfo& output); ///@todo Move ShaderInfo into Jitter. -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT destroy shader. -+/// @param hJitContext - Jit Context -+/// @param hShader - pointer to shader object. 
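-+/// Note: assumed to free a shader object previously returned by JitCompileShader for this context.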
-+void JITCALL JitDestroyShader( -+ HANDLE hJitContext, -+ HANDLE hShader); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles fetch shader -+/// @param hJitContext - Jit Context -+/// @param state - Fetch state to build function from -+PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles streamout shader -+/// @param hJitContext - Jit Context -+/// @param state - SO state to build function from -+PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles blend shader -+/// @param hJitContext - Jit Context -+/// @param state - blend state to build function from -+PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state); -+ -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py -new file mode 100644 -index 0000000..268871b ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py -@@ -0,0 +1,334 @@ -+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+#!deps/python32/python.exe -+ -+import os, sys, re -+import argparse -+import json as JSON -+import operator -+ -+header = r""" -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. 
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file %s -+* -+* @brief auto-generated file -+* -+* DO NOT EDIT -+* -+******************************************************************************/ -+ -+#pragma once -+ -+""" -+ -+""" -+""" -+def gen_file_header(filename): -+ global header -+ headerStr = header % filename -+ return headerStr.splitlines() -+ -+""" -+""" -+def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): -+ -+ llvm_type = '' -+ -+ if is_llvm_struct: -+ if is_pointer or is_pointer_pointer: -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ else: -+ llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type -+ elif is_llvm_enum: -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ elif is_llvm_pfn: -+ llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' -+ else: -+ if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool': -+ llvm_type = 'Type::getInt8Ty(ctx)' -+ elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t': -+ llvm_type = 'Type::getInt64Ty(ctx)' -+ elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': -+ llvm_type = 'Type::getInt16Ty(ctx)' -+ elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t': -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ elif type == 'float' or type == 'FLOAT': -+ llvm_type = 'Type::getFloatTy(ctx)' -+ elif type == 'double' or type == 'DOUBLE': -+ llvm_type = 'Type::getDoubleTy(ctx)' -+ elif type == 'void' or type == 'VOID': -+ llvm_type = 'Type::getInt32Ty(ctx)' -+ elif type == 'HANDLE': -+ llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)' -+ elif type == 'simdscalar': -+ llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' -+ elif type == 'simdscalari': -+ llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' -+ elif type == 'simdvector': -+ llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)' -+ else: -+ llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name) -+ -+ if is_pointer: -+ llvm_type = 'PointerType::get(%s, 0)' % llvm_type -+ -+ if is_pointer_pointer: -+ llvm_type = 'PointerType::get(%s, 0)' % llvm_type -+ -+ if is_array_array: -+ llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) -+ elif is_array: -+ llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) -+ -+ return [' members.push_back( %s ); // %s' % (llvm_type, name)] -+ -+""" -+""" -+def gen_llvm_types(input_file, output_file): -+ -+ output_lines = gen_file_header(os.path.basename(output_file.name)) -+ -+ lines = input_file.readlines() -+ -+ postfix_name = "" -+ -+ for idx in range(len(lines)): -+ line = lines[idx].rstrip() -+ -+ match = re.match(r"(\s*)struct(\s*)(\w+)", line) -+ if match: -+ llvm_args = [] -+ -+ # Detect start of structure -+ is_fwd_decl = re.search(r";", line) -+ -+ if not is_fwd_decl: -+ -+ # Extract the command name -+ 
struct_name = match.group(3).strip() -+ -+ output_lines += [ -+ '//////////////////////////////////////////////////////////////////////////', -+ '/// Generate LLVM type information for %s' % struct_name, -+ 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), -+ '{', -+ ' LLVMContext& ctx = pJitMgr->mContext;', -+ ' std::vector members;', -+ '', -+ ] -+ -+ end_of_struct = False -+ -+ while not end_of_struct and idx < len(lines)-1: -+ idx += 1 -+ line = lines[idx].rstrip() -+ -+ ########################################### -+ # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure. -+ is_llvm_struct = re.search(r"@llvm_struct", line) -+ -+ if is_llvm_struct is not None: -+ is_llvm_struct = True -+ else: -+ is_llvm_struct = False -+ -+ ########################################### -+ # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. -+ is_llvm_enum = re.search(r"@llvm_enum", line) -+ -+ if is_llvm_enum is not None: -+ is_llvm_enum = True -+ else: -+ is_llvm_enum = False -+ -+ ########################################### -+ # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type. -+ is_llvm_pfn = re.search(r"@llvm_pfn", line) -+ -+ if is_llvm_pfn is not None: -+ is_llvm_pfn = True -+ else: -+ is_llvm_pfn = False -+ -+ ########################################### -+ # Is field const? -+ is_const = re.search(r"\s+const\s+", line) -+ -+ if is_const is not None: -+ is_const = True -+ else: -+ is_const = False -+ -+ ########################################### -+ # Is field a pointer? -+ is_pointer_pointer = re.search("\*\*", line) -+ -+ if is_pointer_pointer is not None: -+ is_pointer_pointer = True -+ else: -+ is_pointer_pointer = False -+ -+ ########################################### -+ # Is field a pointer? -+ is_pointer = re.search("\*", line) -+ -+ if is_pointer is not None: -+ is_pointer = True -+ else: -+ is_pointer = False -+ -+ ########################################### -+ # Is field an array of arrays? -+ # TODO: Can add this to a list. -+ is_array_array = re.search("\[(\w*)\]\[(\w*)\]", line) -+ array_count = '0' -+ array_count1 = '0' -+ -+ if is_array_array is not None: -+ array_count = is_array_array.group(1) -+ array_count1 = is_array_array.group(2) -+ is_array_array = True -+ else: -+ is_array_array = False -+ -+ ########################################### -+ # Is field an array? 
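-+ # e.g. a field like "STREAMOUT_DECL decl[128]" captures array_count = "128"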
-+ is_array = re.search("\[(\w*)\]", line) -+ -+ if is_array is not None: -+ array_count = is_array.group(1) -+ is_array = True -+ else: -+ is_array = False -+ -+ is_scoped = re.search("::", line) -+ -+ if is_scoped is not None: -+ is_scoped = True -+ else: -+ is_scoped = False -+ -+ type = None -+ name = None -+ if is_const and is_pointer: -+ -+ if is_scoped: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line) -+ -+ type = "%s%s" % (field_match.group(4), field_match.group(5)) -+ name = field_match.group(7) -+ else: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line) -+ -+ type = field_match.group(4) -+ name = field_match.group(6) -+ -+ elif is_pointer: -+ field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(3) -+ name = field_match.group(5) -+ elif is_const: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(4) -+ name = field_match.group(6) -+ else: -+ if is_scoped: -+ field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(1) + '::' + field_match.group(2) -+ name = field_match.group(3) -+ else: -+ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line) -+ -+ if field_match: -+ type = field_match.group(2) -+ name = field_match.group(4) -+ -+ if type is not None: -+ output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file) -+ llvm_args.append(name) -+ -+ # Detect end of structure -+ end_of_struct = re.match(r"(\s*)};", line) -+ -+ if (end_of_struct): -+ output_lines += [ -+ '', -+ ' return StructType::get(ctx, members, false);', -+ '}', -+ '', -+ ] -+ -+ for i in range(len(llvm_args)): -+ output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) -+ -+ output_lines.append('') -+ -+ output_file.write('\n'.join(output_lines) + '\n') -+ -+""" -+ Function which is invoked when this script is started from a command line. -+ Will present and consume a set of arguments which will tell this script how -+ to behave -+""" -+def main(): -+ -+ # Parse args... -+ parser = argparse.ArgumentParser() -+ parser.add_argument("--input", "-i", type=argparse.FileType('r'), -+ help="Path to input file containing structs", required=True) -+ parser.add_argument("--output", "-o", type=argparse.FileType('w'), -+ help="Path to output file", required=True) -+ parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False) -+ args = parser.parse_args() -+ -+ gen_llvm_types(args.input, args.output) -+ -+if __name__ == '__main__': -+ main() -+# END OF FILE -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp -new file mode 100644 -index 0000000..6a64a1c ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp -@@ -0,0 +1,348 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file streamout_jit.cpp -+* -+* @brief Implementation of the streamout jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#include "jit_api.h" -+#include "streamout_jit.h" -+#include "builder.h" -+#include "state_llvm.h" -+#include "common/containers.hpp" -+#include "llvm/IR/DataLayout.h" -+ -+#include <sstream> -+#include <unordered_set> -+ -+////////////////////////////////////////////////////////////////////////// -+/// Interface to jitting a streamout shader -+////////////////////////////////////////////////////////////////////////// -+struct StreamOutJit : public Builder -+{ -+ StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){}; -+ -+ // returns pointer to SWR_STREAMOUT_BUFFER -+ Value* getSOBuffer(Value* pSoCtx, uint32_t buffer) -+ { -+ return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer }); -+ } -+ -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief checks if streamout buffer is oob -+ // @return true/false -+ Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer) -+ { -+ Value* returnMask = C(false); -+ -+ Value* pBuf = getSOBuffer(pSoCtx, buffer); -+ -+ // load enable -+ // @todo bool data types should generate llvm type -+ Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty()); -+ -+ // load buffer size -+ Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize }); -+ -+ // load current streamOffset -+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ -+ // load buffer pitch -+ Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); -+ -+ // buffer is considered oob if in use in a decl but not enabled -+ returnMask = OR(returnMask, NOT(enabled)); -+ -+ // buffer is oob if cannot fit a prim's worth of verts -+ Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim))); -+ returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize)); -+ -+ return returnMask; -+ } -+ -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector, -+ // packing the active mask bits -+ // ex. 
bitmask 0011 -> (0, 1, 0, 0) -+ // bitmask 1000 -> (3, 0, 0, 0) -+ // bitmask 1100 -> (2, 3, 0, 0) -+ Value* PackMask(uint32_t bitmask) -+ { -+ std::vector<Constant*> indices(4, C(0)); -+ DWORD index; -+ uint32_t elem = 0; -+ while (_BitScanForward(&index, bitmask)) -+ { -+ indices[elem++] = C((int)index); -+ bitmask &= ~(1 << index); -+ } -+ -+ return ConstantVector::get(indices); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief convert scalar bitmask to <4xfloat> bitmask -+ Value* ToMask(uint32_t bitmask) -+ { -+ std::vector<Constant*> indices; -+ for (uint32_t i = 0; i < 4; ++i) -+ { -+ if (bitmask & (1 << i)) -+ { -+ indices.push_back(C(-1.0f)); -+ } -+ else -+ { -+ indices.push_back(C(0.0f)); -+ } -+ } -+ return ConstantVector::get(indices); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief processes a single decl from the streamout stream. Reads 4 components from the input -+ // stream and writes N components to the output buffer given the componentMask or if -+ // a hole, just increments the buffer pointer -+ // @param pStream - pointer to current attribute -+ // @param pOutBuffers - pointers to the current location of each output buffer -+ // @param decl - input decl -+ void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) -+ { -+ // @todo add this to x86 macros -+ Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); -+ -+ uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); -+ uint32_t packedMask = (1 << numComponents) - 1; -+ if (!decl.hole) -+ { -+ // increment stream pointer to correct slot -+ Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); -+ -+ // load 4 components from stream -+ Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); -+ Type* simd4PtrTy = PointerType::get(simd4Ty, 0); -+ pAttrib = BITCAST(pAttrib, simd4PtrTy); -+ Value *vattrib = LOAD(pAttrib); -+ -+ // shuffle/pack enabled components -+ Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); -+ -+ // store to output buffer -+ // cast SO buffer to i8*, needed by maskstore -+ Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); -+ -+ // cast input to <4xfloat> -+ Value* src = BITCAST(vpackedAttrib, simd4Ty); -+ CALL3(maskStore, pOut, ToMask(packedMask), src); -+ } -+ -+ // increment SO buffer -+ pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ // @brief builds a single vertex worth of data for the given stream -+ // @param streamState - state for this stream -+ // @param pCurVertex - pointer to src stream vertex data -+ // @param pOutBuffer - pointers to up to 4 SO buffers -+ void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4]) -+ { -+ for (uint32_t d = 0; d < streamState.numDecls; ++d) -+ { -+ const STREAMOUT_DECL& decl = streamState.decl[d]; -+ buildDecl(pCurVertex, pOutBuffer, decl); -+ } -+ } -+ -+ void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) -+ { -+ // get list of active SO buffers -+ std::unordered_set<uint32_t> activeSOBuffers; -+ for (uint32_t d = 0; d < streamState.numDecls; ++d) -+ { -+ const STREAMOUT_DECL& decl = streamState.decl[d]; -+ activeSOBuffers.insert(decl.bufferIndex); -+ } -+ -+ // always increment 
numPrimStorageNeeded -+ Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); -+ numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); -+ STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); -+ -+ // check OOB on active SO buffers. If any buffer is out of bounds, don't write -+ // the primitive to any buffer -+ Value* oobMask = C(false); -+ for (uint32_t buffer : activeSOBuffers) -+ { -+ oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); -+ } -+ -+ BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); -+ -+ // early out if OOB -+ COND_BR(oobMask, returnBB, validBB); -+ -+ IRB()->SetInsertPoint(validBB); -+ -+ Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); -+ numPrimsWritten = ADD(numPrimsWritten, C(1)); -+ STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); -+ -+ // compute start pointer for each output buffer -+ Value* pOutBuffer[4]; -+ Value* pOutBufferStartVertex[4]; -+ Value* outBufferPitch[4]; -+ for (uint32_t b: activeSOBuffers) -+ { -+ Value* pBuf = getSOBuffer(pSoCtx, b); -+ Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); -+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ pOutBuffer[b] = GEP(pData, streamOffset); -+ pOutBufferStartVertex[b] = pOutBuffer[b]; -+ -+ outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); -+ } -+ -+ // loop over the vertices of the prim -+ Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); -+ for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) -+ { -+ buildVertex(streamState, pStreamData, pOutBuffer); -+ -+ // increment stream and output buffer pointers -+ // stream verts are always 32*4 dwords apart -+ pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4)); -+ -+ // output buffers offset using pitch in buffer state -+ for (uint32_t b : activeSOBuffers) -+ { -+ pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); -+ pOutBuffer[b] = pOutBufferStartVertex[b]; -+ } -+ } -+ -+ // update each active buffer's streamOffset -+ for (uint32_t b : activeSOBuffers) -+ { -+ Value* pBuf = getSOBuffer(pSoCtx, b); -+ Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); -+ STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); -+ } -+ } -+ -+ Function* Create(const STREAMOUT_COMPILE_STATE& state) -+ { -+ static std::size_t soNum = 0; -+ -+ std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); -+ fnName << soNum++; -+ -+ // SO function signature -+ // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*) -+ -+ std::vector<Type*> args{ -+ PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* -+ }; -+ -+ FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); -+ Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); -+ -+ // create return basic block -+ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); -+ BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); -+ -+ IRB()->SetInsertPoint(entry); -+ -+ // arguments -+ auto argitr = soFunc->getArgumentList().begin(); -+ Value* pSoCtx = argitr++; -+ pSoCtx->setName("pSoCtx"); -+ -+ const STREAMOUT_STREAM& streamState = state.stream; -+ buildStream(state, 
streamState, pSoCtx, returnBB, soFunc); -+ -+ BR(returnBB); -+ -+ IRB()->SetInsertPoint(returnBB); -+ RET_VOID(); -+ -+ JitManager::DumpToFile(soFunc, "SoFunc"); -+ -+ FunctionPassManager passes(JM()->mpCurrentModule); -+ passes.add(createBreakCriticalEdgesPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createPromoteMemoryToRegisterPass()); -+ passes.add(createCFGSimplificationPass()); -+ passes.add(createEarlyCSEPass()); -+ passes.add(createInstructionCombiningPass()); -+ passes.add(createInstructionSimplifierPass()); -+ passes.add(createConstantPropagationPass()); -+ passes.add(createSCCPPass()); -+ passes.add(createAggressiveDCEPass()); -+ -+ passes.run(*soFunc); -+ -+ JitManager::DumpToFile(soFunc, "SoFunc_optimized"); -+ -+ return soFunc; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JITs from streamout shader IR -+/// @param hJitMgr - JitManager handle -+/// @param func - LLVM function IR -+/// @return PFN_SO_FUNC - pointer to SOS function -+PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) -+{ -+ const llvm::Function *func = (const llvm::Function*)hFunc; -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ PFN_SO_FUNC pfnStreamOut; -+ pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); -+ // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module -+ pJitMgr->mIsModuleFinalized = true; -+ -+ return pfnStreamOut; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief JIT compiles streamout shader -+/// @param hJitMgr - JitManager handle -+/// @param state - SO state to build function from -+extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state) -+{ -+ JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); -+ -+ pJitMgr->SetupNewModule(); -+ -+ StreamOutJit theJit(pJitMgr); -+ HANDLE hFunc = theJit.Create(state); -+ -+ return JitStreamoutFunc(hJitMgr, hFunc); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h -new file mode 100644 -index 0000000..4372a9d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h -@@ -0,0 +1,91 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file streamout_jit.h -+* -+* @brief Definition of the streamout jitter -+* -+* Notes: -+* -+******************************************************************************/ -+#pragma once -+ -+#include "common/formats.h" -+#include "core/state.h" -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_DECL - Stream decl -+////////////////////////////////////////////////////////////////////////// -+struct STREAMOUT_DECL -+{ -+ // Buffer that stream maps to. -+ DWORD bufferIndex; -+ -+ // attribute to stream -+ uint32_t attribSlot; -+ -+ // attribute component mask -+ uint32_t componentMask; -+ -+ // indicates this decl is a hole -+ bool hole; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// STREAMOUT_STREAM - Stream decls -+////////////////////////////////////////////////////////////////////////// -+struct STREAMOUT_STREAM -+{ -+ // number of decls for this stream -+ uint32_t numDecls; -+ -+ // array of numDecls decls -+ STREAMOUT_DECL decl[128]; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// State required for streamout jit -+////////////////////////////////////////////////////////////////////////// -+struct STREAMOUT_COMPILE_STATE -+{ -+ // number of verts per primitive -+ uint32_t numVertsPerPrim; -+ -+ // stream decls -+ STREAMOUT_STREAM stream; -+ -+ bool operator==(const STREAMOUT_COMPILE_STATE &other) const -+ { -+ if (numVertsPerPrim != other.numVertsPerPrim) return false; -+ if (stream.numDecls != other.stream.numDecls) return false; -+ -+ for (uint32_t i = 0; i < stream.numDecls; ++i) -+ { -+ if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false; -+ if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false; -+ if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false; -+ if (stream.decl[i].hole != other.stream.decl[i].hole) return false; -+ } -+ -+ return true; -+ } -+}; -diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp -new file mode 100644 -index 0000000..ad73cd8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp -@@ -0,0 +1,287 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software.
-+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file ClearTile.cpp -+* -+* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro -+* tile in the destination. -+* -+******************************************************************************/ -+#include "common/os.h" -+#include "core/context.h" -+#include "common/formats.h" -+#include "memory/TilingFunctions.h" -+#include "memory/tilingtraits.h" -+#include "memory/Convert.h" -+ -+typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Clear Raster Tile Function Tables. -+////////////////////////////////////////////////////////////////////////// -+static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS]; -+ -+static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreRasterTileClear -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreRasterTileClear -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pColor - Pointer to clear color. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void StoreClear( -+ const BYTE* dstFormattedColor, -+ UINT dstBytesPerPixel, -+ SWR_SURFACE_STATE* pDstSurface, -+ UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. -+ { -+ // Compute destination address for raster tile. -+ BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress + -+ (y * pDstSurface->pitch) + (x * dstBytesPerPixel); -+ -+ // start of first row -+ BYTE* pDst = pDstTile; -+ UINT dstBytesPerRow = 0; -+ -+ // For each raster tile pixel in row 0 (rx, 0) -+ for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx) -+ { -+ memcpy(pDst, dstFormattedColor, dstBytesPerPixel); -+ -+ // Increment pointer to next pixel in row. -+ pDst += dstBytesPerPixel; -+ dstBytesPerRow += dstBytesPerPixel; -+ } -+ -+ // start of second row -+ pDst = pDstTile + pDstSurface->pitch; -+ -+ // For each remaining row in the rest of the raster tile -+ for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry) -+ { -+ // copy row -+ memcpy(pDst, pDstTile, dstBytesPerRow); -+ -+ // Increment pointer to first pixel in next row. -+ pDst += pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles. -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreMacroTileClear -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores a macrotile to the destination surface. -+ /// @param pColor - Pointer to color to write to pixels. 
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void StoreClear( -+ const FLOAT *pColor, -+ SWR_SURFACE_STATE* pDstSurface, -+ UINT x, UINT y) -+ { -+ UINT dstBytesPerPixel = (FormatTraits::bpp / 8); -+ -+ BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel -+ -+ FLOAT srcColor[4]; -+ -+ for (UINT comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ srcColor[comp] = pColor[FormatTraits::swizzle(comp)]; -+ } -+ -+ // using this helper function, but the Tiling Traits is unused inside it so just using a dummy value -+ ConvertPixelFromFloat(dstFormattedColor, srcColor); -+ -+ // Store each raster tile from the hot tile to the destination surface. -+ // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens. -+ // Intent is for this function to only handle full tiles. -+ for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ StoreRasterTileClear::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row)); -+ } -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Writes clear color to every pixel of a render surface -+/// @param hPrivateContext - Handle to private DC -+/// @param renderTargetIndex - Index to destination render target -+/// @param x, y - Coordinates to raster tile. -+/// @param pClearColor - Pointer to clear color -+void StoreHotTileClear( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ UINT x, -+ UINT y, -+ const float* pClearColor) -+{ -+ PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL; -+ -+ SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet. -+ -+ if (renderTargetIndex != SWR_ATTACHMENT_DEPTH) -+ { -+ pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format]; -+ } -+ else -+ { -+ pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format]; -+ } -+ -+ SWR_ASSERT(pfnStoreTilesClear != NULL); -+ -+ // Store a macro tile. -+ /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress. -+ if (pfnStoreTilesClear != NULL) -+ { -+ pfnStoreTilesClear(pClearColor, pDstSurface, x, y); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 
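Each entry of the table built by the macro below instantiates StoreMacroTileClear for one surface format; StoreHotTileClear above then dispatches by indexing the table with pDstSurface->format. As a rough illustration of the per-pixel conversion these clears rely on, here is a hedged sketch using the non-templated ConvertPixelFromFloat overload from Convert.h (the exact byte order depends on the format traits for the chosen format):

    // Illustrative only -- not part of the patch.
    uint8_t pixel[4];
    const float clearColor[4] = { 1.0f, 0.5f, 0.0f, 1.0f }; // RGBA
    ConvertPixelFromFloat(R8G8B8A8_UNORM, pixel, clearColor);
    // Each UNORM8 channel is clamped to [0,1], scaled by 255 and rounded,
    // so pixel should hold { 0xFF, 0x80, 0x00, 0xFF }.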
-+#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \ -+ memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \ -+ \ -+ sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = 
StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_UINT] 
= StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear::StoreClear; \ -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. -+#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \ -+ memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \ -+ \ -+ sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear::StoreClear; \ -+ sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear::StoreClear; \ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sets up tables for ClearTile -+void InitSimClearTilesTable() -+{ -+ INIT_STORE_TILES_CLEAR_COLOR_TABLE(); -+ INIT_STORE_TILES_CLEAR_DEPTH_TABLE(); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h -new file mode 100644 -index 0000000..0f9e0ad ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h -@@ -0,0 +1,698 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file Convert.h -+* -+* @brief Conversion utility functions -+* -+******************************************************************************/ -+#pragma once -+ -+#if defined(_WIN32) -+// disable "potential divide by 0" -+#pragma warning(disable: 4723) -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision -+/// float -+/// @param val - 16-bit float -+/// @todo Maybe move this outside of this file into a header? 
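For reference, a few worked conversions through this helper (standard IEEE half-precision encodings; easy to check against the bit manipulation below):

    // Illustrative values only -- not part of the patch.
    ConvertSmallFloatTo32(0x3C00); // == 1.0f (sign 0, exponent 15, mantissa 0)
    ConvertSmallFloatTo32(0x4000); // == 2.0f
    ConvertSmallFloatTo32(0xC000); // == -2.0f
    ConvertSmallFloatTo32(0x7C00); // == +infinity (bits 0x7f800000)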
-+static float ConvertSmallFloatTo32(UINT val) -+{ -+ UINT result; -+ if ((val & 0x7fff) == 0) -+ { -+ result = ((uint32_t)(val & 0x8000)) << 16; -+ } -+ else if ((val & 0x7c00) == 0x7c00) -+ { -+ result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; -+ result |= ((uint32_t)val & 0x8000) << 16; -+ } -+ else -+ { -+ uint32_t sign = (val & 0x8000) << 16; -+ uint32_t mant = (val & 0x3ff) << 13; -+ uint32_t exp = (val >> 10) & 0x1f; -+ if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals -+ { -+ mant <<= 1; -+ while (mant < (0x400 << 13)) -+ { -+ exp--; -+ mant <<= 1; -+ } -+ mant &= (0x3ff << 13); -+ } -+ exp = ((exp - 15 + 127) & 0xff) << 23; -+ result = sign | exp | mant; -+ } -+ -+ return *(float*)&result; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert an IEEE 754 32-bit single precision float to an -+/// unsigned small float with 5 exponent bits and a variable -+/// number of mantissa bits. -+/// @param val - 32-bit float -+/// @todo Maybe move this outside of this file into a header? -+template -+static UINT Convert32ToSmallFloat(float val) -+{ -+ uint32_t sign, exp, mant; -+ uint32_t roundBits; -+ -+ // Extract the sign, exponent, and mantissa -+ UINT uf = *(UINT*)&val; -+ -+ sign = (uf & 0x80000000) >> 31; -+ exp = (uf & 0x7F800000) >> 23; -+ mant = uf & 0x007FFFFF; -+ -+ // 10/11 bit floats are unsigned. Negative values are clamped to 0. -+ if (sign != 0) -+ { -+ exp = mant = 0; -+ } -+ // Check for out of range -+ else if ((exp == 0xFF) && (mant != 0)) // NaN -+ { -+ exp = 0x1F; -+ mant = 1 << numMantissaBits; -+ } -+ else if ((exp == 0xFF) && (mant == 0)) // INF -+ { -+ exp = 0x1F; -+ mant = 0; -+ } -+ else if (exp > (0x70 + 0x1E)) // Too big to represent -+ { -+ exp = 0x1Eu; -+ mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa. -+ } -+ else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm -+ { -+ mant |= 0x00800000; -+ for (; exp <= 0x70; mant >>= 1, exp++) -+ ; -+ exp = 0; -+ mant = mant >> (23 - numMantissaBits); -+ } -+ else if (exp < 0x66) // Too small to represent -> Zero -+ { -+ exp = 0; -+ mant = 0; -+ } -+ else -+ { -+ // Saves bits that will be shifted off for rounding -+ roundBits = mant & 0x1FFFu; -+ // convert exponent and mantissa to 16 bit format -+ exp = exp - 0x70u; -+ mant = mant >> (23 - numMantissaBits); -+ -+ // Essentially RTZ, but round up if off by only 1 lsb -+ if (roundBits == 0x1FFFu) -+ { -+ mant++; -+ // check for overflow -+ if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits) -+ exp++; -+ // make sure only the needed bits are used -+ mant &= (1 << numMantissaBits) - 1; -+ } -+ } -+ -+ UINT tmpVal = (exp << numMantissaBits) | mant; -+ return tmpVal; -+} -+ -+#if KNOB_ARCH == KNOB_ARCH_AVX -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert an IEEE 754 32-bit single precision float to an -+/// 16 bit float with 5 exponent bits and a variable -+/// number of mantissa bits. -+/// @param val - 32-bit float -+/// @todo Maybe move this outside of this file into a header? 
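And in the encode direction, the routine below should behave as follows (worked from its clamping rules; illustrative only):

    // Illustrative values only -- not part of the patch.
    Convert32To16Float(1.0f);     // == 0x3C00
    Convert32To16Float(65504.0f); // == 0x7BFF (largest finite half; larger finite inputs clamp here)
    Convert32To16Float(INFINITY); // == 0x7C00
    // NaN inputs produce 0xFE00, since this routine sets the sign bit for NaNs.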
-+static uint16_t Convert32To16Float(float val) -+{ -+ uint32_t sign, exp, mant; -+ uint32_t roundBits; -+ -+ // Extract the sign, exponent, and mantissa -+ uint32_t uf = *(uint32_t*)&val; -+ sign = (uf & 0x80000000) >> 31; -+ exp = (uf & 0x7F800000) >> 23; -+ mant = uf & 0x007FFFFF; -+ -+ // Check for out of range -+ if (std::isnan(val)) -+ { -+ exp = 0x1F; -+ mant = 0x200; -+ sign = 1; // set the sign bit for NANs -+ } -+ else if (std::isinf(val)) -+ { -+ exp = 0x1f; -+ mant = 0x0; -+ } -+ else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value -+ { -+ exp = 0x1E; -+ mant = 0x3FF; -+ } -+ else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm -+ { -+ mant |= 0x00800000; -+ for (; exp <= 0x70; mant >>= 1, exp++) -+ ; -+ exp = 0; -+ mant = mant >> 13; -+ } -+ else if (exp < 0x66) // Too small to represent -> Zero -+ { -+ exp = 0; -+ mant = 0; -+ } -+ else -+ { -+ // Saves bits that will be shifted off for rounding -+ roundBits = mant & 0x1FFFu; -+ // convert exponent and mantissa to 16 bit format -+ exp = exp - 0x70; -+ mant = mant >> 13; -+ -+ // Essentially RTZ, but round up if off by only 1 lsb -+ if (roundBits == 0x1FFFu) -+ { -+ mant++; -+ // check for overflow -+ if ((mant & 0xC00u) != 0) -+ exp++; -+ // make sure only the needed bits are used -+ mant &= 0x3FF; -+ } -+ } -+ -+ uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; -+ return (uint16_t)tmpVal; -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Retrieve color from hot tile source which is always float. -+/// @param pDstPixel - Pointer to destination pixel. -+/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest). -+template -+static void ConvertPixelFromFloat( -+ BYTE* pDstPixel, -+ const float srcPixel[4]) -+{ -+ UINT outColor[4]; // typeless bits -+ -+ // Store component -+ for (UINT comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ SWR_TYPE type = FormatTraits::GetType(comp); -+ -+ float src = srcPixel[comp]; -+ -+ switch (type) -+ { -+ case SWR_TYPE_UNORM: -+ { -+ // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. -+ src = (src != src) ? 0.0f : src; -+ -+ // Clamp [0, 1] -+ src = std::max(src, 0.0f); -+ src = std::min(src, 1.0f); -+ -+ // SRGB -+ if (FormatTraits::isSRGB && comp != 3) -+ { -+ src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f); -+ } -+ -+ // Float scale to integer scale. -+ UINT scale = (1 << FormatTraits::GetBPC(comp)) - 1; -+ src = (float)scale * src; -+ src = roundf(src); -+ outColor[comp] = (UINT)src; // Drop fractional part. -+ break; -+ } -+ case SWR_TYPE_SNORM: -+ { -+ SWR_ASSERT(!FormatTraits::isSRGB); -+ -+ // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. -+ src = (src != src) ? 0.0f : src; -+ -+ // Clamp [-1, 1] -+ src = std::max(src, -1.0f); -+ src = std::min(src, 1.0f); -+ -+ // Float scale to integer scale. -+ UINT scale = (1 << (FormatTraits::GetBPC(comp) - 1)) - 1; -+ src = (float)scale * src; -+ -+ // Round -+ src += (src >= 0) ? 0.5f : -0.5f; -+ -+ INT out = (INT)src; -+ -+ outColor[comp] = *(UINT*)&out; -+ -+ break; -+ } -+ case SWR_TYPE_UINT: -+ { -+ ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float. -+ // However, the number in the hot tile should be unsigned integer. So doing this -+ // to preserve bits intead of doing a float -> integer conversion. 
-+ if (FormatTraits::GetBPC(comp) == 32) -+ { -+ outColor[comp] = *(UINT*)&src; -+ } -+ else -+ { -+ outColor[comp] = *(UINT*)&src; -+ UINT max = (1 << FormatTraits::GetBPC(comp)) - 1; // 2^numBits - 1 -+ -+ outColor[comp] = std::min(max, outColor[comp]); -+ } -+ break; -+ } -+ case SWR_TYPE_SINT: -+ { -+ if (FormatTraits::GetBPC(comp) == 32) -+ { -+ outColor[comp] = *(UINT*)&src; -+ } -+ else -+ { -+ INT out = *(INT*)&src; // Hot tile format is SINT? -+ INT max = (1 << (FormatTraits::GetBPC(comp) - 1)) - 1; -+ INT min = -1 - max; -+ -+ ///@note The output is unsigned integer (bag of bits) and so performing -+ // the clamping here based on range of output component. Also, manually adding -+ // the sign bit in the appropriate spot. Maybe a better way? -+ out = std::max(out, min); -+ out = std::min(out, max); -+ -+ outColor[comp] = *(UINT*)&out; -+ } -+ break; -+ } -+ case SWR_TYPE_FLOAT: -+ { -+ if (FormatTraits::GetBPC(comp) == 16) -+ { -+ // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph -+ // @todo 16bit float instruction support is orthogonal to avx support. need to -+ // add check for F16C support instead. -+#if KNOB_ARCH == KNOB_ARCH_AVX2 -+ __m128 src128 = _mm_set1_ps(src); -+ __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC); -+ UINT value = _mm_extract_epi16(srci128, 0); -+#else -+ UINT value = Convert32To16Float(src); -+#endif -+ -+ outColor[comp] = value; -+ } -+ else if (FormatTraits::GetBPC(comp) == 11) -+ { -+ outColor[comp] = Convert32ToSmallFloat<6>(src); -+ } -+ else if (FormatTraits::GetBPC(comp) == 10) -+ { -+ outColor[comp] = Convert32ToSmallFloat<5>(src); -+ } -+ else -+ { -+ outColor[comp] = *(UINT*)&src; -+ } -+ -+ break; -+ } -+ default: -+ SWR_ASSERT(0); -+ break; -+ } -+ } -+ -+ typename FormatTraits::FormatT* pPixel = (typename FormatTraits::FormatT*)pDstPixel; -+ -+ switch (FormatTraits::numComps) -+ { -+ case 4: -+ pPixel->a = outColor[3]; -+ case 3: -+ pPixel->b = outColor[2]; -+ case 2: -+ pPixel->g = outColor[1]; -+ case 1: -+ pPixel->r = outColor[0]; -+ break; -+ default: -+ SWR_ASSERT(0); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Convert pixel in any format to float32 -+/// @param pDstPixel - Pointer to destination pixel. 
-+/// @param srcPixel - Pointer to source pixel -+template -+INLINE static void ConvertPixelToFloat( -+ float dstPixel[4], -+ const BYTE* pSrc) -+{ -+ UINT srcColor[4]; // typeless bits -+ -+ // unpack src pixel -+ typename FormatTraits::FormatT* pPixel = (typename FormatTraits::FormatT*)pSrc; -+ -+ // apply format defaults -+ for (uint32_t comp = 0; comp < 4; ++comp) -+ { -+ uint32_t def = FormatTraits::GetDefault(comp); -+ dstPixel[comp] = *(float*)&def; -+ } -+ -+ // load format data -+ switch (FormatTraits::numComps) -+ { -+ case 4: -+ srcColor[3] = pPixel->a; -+ case 3: -+ srcColor[2] = pPixel->b; -+ case 2: -+ srcColor[1] = pPixel->g; -+ case 1: -+ srcColor[0] = pPixel->r; -+ break; -+ default: -+ SWR_ASSERT(0); -+ } -+ -+ // Convert components -+ for (UINT comp = 0; comp < FormatTraits::numComps; ++comp) -+ { -+ SWR_TYPE type = FormatTraits::GetType(comp); -+ -+ UINT src = srcColor[comp]; -+ -+ switch (type) -+ { -+ case SWR_TYPE_UNORM: -+ { -+ float dst; -+ if (FormatTraits::isSRGB && comp != 3) -+ { -+ dst = *(float*)&srgb8Table[src]; -+ } -+ else -+ { -+ // component sizes > 16 must use fp divide to maintain ulp requirements -+ if (FormatTraits::GetBPC(comp) > 16) -+ { -+ dst = (float)src / (float)((1 << FormatTraits::GetBPC(comp)) - 1); -+ } -+ else -+ { -+ const float scale = (1.0f / (float)((1 << FormatTraits::GetBPC(comp)) - 1)); -+ dst = (float)src * scale; -+ } -+ } -+ dstPixel[FormatTraits::swizzle(comp)] = dst; -+ break; -+ } -+ case SWR_TYPE_SNORM: -+ { -+ SWR_ASSERT(!FormatTraits::isSRGB); -+ -+ float dst; -+ if (src == 0x10) -+ { -+ dst = -1.0f; -+ } -+ else -+ { -+ switch (FormatTraits::GetBPC(comp)) -+ { -+ case 8: -+ dst = (float)((int8_t)src); -+ break; -+ case 16: -+ dst = (float)((int16_t)src); -+ break; -+ case 32: -+ dst = (float)((int32_t)src); -+ break; -+ default: -+ assert(0 && "attempted to load from SNORM with unsupported bpc"); -+ dst = 0.0f; -+ break; -+ } -+ dst = dst * (1.0f / ((1 << (FormatTraits::GetBPC(comp) - 1)) - 1)); -+ } -+ dstPixel[FormatTraits::swizzle(comp)] = dst; -+ break; -+ } -+ case SWR_TYPE_UINT: -+ { -+ UINT dst = (UINT)src; -+ dstPixel[FormatTraits::swizzle(comp)] = *(float*)&dst; -+ break; -+ } -+ case SWR_TYPE_SINT: -+ { -+ int dst; -+ switch (FormatTraits::GetBPC(comp)) -+ { -+ case 8: -+ dst = (int8_t)src; -+ break; -+ case 16: -+ dst = (int16_t)src; -+ break; -+ case 32: -+ dst = (int32_t)src; -+ break; -+ default: -+ assert(0 && "attempted to load from SINT with unsupported bpc"); -+ dst = 0; -+ break; -+ } -+ dstPixel[FormatTraits::swizzle(comp)] = *(float*)&dst; -+ break; -+ } -+ case SWR_TYPE_FLOAT: -+ { -+ float dst; -+ if (FormatTraits::GetBPC(comp) == 16) -+ { -+#if KNOB_ARCH == KNOB_ARCH_AVX2 -+ // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps -+ // @todo 16bit float instruction support is orthogonal to avx support. need to -+ // add check for F16C support instead. 
-+ __m128i src128 = _mm_set1_epi32(src); -+ __m128 res = _mm_cvtph_ps(src128); -+ _mm_store_ss(&dst, res); -+#else -+ dst = ConvertSmallFloatTo32(src); -+#endif -+ } -+ else if (FormatTraits::GetBPC(comp) == 11) -+ { -+ dst = ConvertSmallFloatTo32(src << 4); -+ } -+ else if (FormatTraits::GetBPC(comp) == 10) -+ { -+ dst = ConvertSmallFloatTo32(src << 5); -+ } -+ else -+ { -+ dst = *(float*)&src; -+ } -+ -+ dstPixel[FormatTraits::swizzle(comp)] = *(float*)&dst; -+ break; -+ } -+ default: -+ SWR_ASSERT(0); -+ break; -+ } -+ } -+} -+ -+// non-templated version of conversion functions -+INLINE static void ConvertPixelFromFloat( -+ SWR_FORMAT format, -+ uint8_t* pDst, -+ const float srcPixel[4]) -+{ -+ switch (format) -+ { -+ case R32G32B32A32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32X32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32A32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32B32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16X16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16X16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16A16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32G32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8A8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); 
break; -+ case R16G16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R11G11B10_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A32_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8X8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8X8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10X2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10X2_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8A8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R32_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G6R5_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5A1_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B4G4R4A4_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5X1_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_UINT: 
ConvertPixelFromFloat(pDst, srcPixel); break; -+ case A8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case YCRCB_SWAPUVY: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC1_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC2_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC3_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC4_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC5_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC1_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC2_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC3_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case YCRCB_SWAPUV: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC4_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC5_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_FLOAT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC7_UNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case BC7_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R16G16B16_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R10G10B10A2_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_SNORM: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_USCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_SSCALED: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case B10G10R10A2_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_UINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ case R8G8B8_SINT: ConvertPixelFromFloat(pDst, srcPixel); break; -+ default: -+ break; -+ } -+} -+ -+ -diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp -new file mode 100644 -index 0000000..49893e8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp -@@ -0,0 +1,382 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file LoadTile.cpp -+* -+* @brief Functionality for Load -+* -+******************************************************************************/ -+#include "common/os.h" -+#include "common/formats.h" -+#include "core/context.h" -+#include "core/rdtsc_core.h" -+#include "memory/TilingFunctions.h" -+#include "memory/tilingtraits.h" -+#include "memory/Convert.h" -+ -+typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Load Raster Tile Function Tables. -+////////////////////////////////////////////////////////////////////////// -+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -+static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; -+ -+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -+static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; -+ -+static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; -+ -+////////////////////////////////////////////////////////////////////////// -+/// LoadRasterTile -+////////////////////////////////////////////////////////////////////////// -+template -+struct LoadRasterTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from hot tile source which is always float. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param x, y - Coordinates to raster tile. -+ /// @param output - output color -+ INLINE static void SetSwizzledDstColor( -+ const float srcColor[4], -+ uint32_t x, uint32_t y, -+ uint8_t* pDst) -+ { -+ typedef SimdTile SimdT; -+ -+ SimdT* pDstSimdTiles = (SimdT*)pDst; -+ -+ // Compute which simd tile we're accessing within 8x8 tile. -+ // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. -+ uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); -+ -+ SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; -+ -+ uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); -+ -+ pSimdTile->SetSwizzledColor(simdOffset, srcColor); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Loads an 8x8 raster tile from the src surface. 
-+ /// @param pSrcSurface - Src surface state -+ /// @param pDst - Destination hot tile pointer -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Load( -+ SWR_SURFACE_STATE* pSrcSurface, -+ uint8_t* pDst, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. -+ { -+ uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; -+ uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; -+ -+ // For each raster tile pixel (rx, ry) -+ for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) -+ { -+ for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) -+ { -+ if (((x + rx) < lodWidth) && -+ ((y + ry) < lodHeight)) -+ { -+ uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, -+ pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, -+ pSrcSurface->lod, pSrcSurface); -+ -+ float srcColor[4]; -+ ConvertPixelToFloat(srcColor, pSrc); -+ -+ // store pixel to hottile -+ SetSwizzledDstColor(srcColor, rx, ry, pDst); -+ } -+ } -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// LoadMacroTile - Loads a macro tile which consists of raster tiles. -+////////////////////////////////////////////////////////////////////////// -+template -+struct LoadMacroTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Load a macrotile to the destination surface. -+ /// @param pSrc - Pointer to macro tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void Load( -+ SWR_SURFACE_STATE* pSrcSurface, -+ uint8_t *pDstHotTile, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) -+ { -+ // Load each raster tile from the hot tile to the destination surface. -+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) -+ { -+ LoadRasterTile::Load(pSrcSurface, pDstHotTile, -+ (x + col), (y + row), sampleNum, renderTargetArrayIndex); -+ pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); -+ } -+ } -+ } -+ } -+}; -+ -+static void BUCKETS_START(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StartBucket(id); -+#endif -+} -+ -+static void BUCKETS_STOP(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StopBucket(id); -+#endif -+} -+ -+// on demand buckets for load tiles -+static std::vector sBuckets(NUM_SWR_FORMATS, -1); -+static std::mutex sBucketMutex; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Loads a full hottile from a render surface -+/// @param hPrivateContext - Handle to private DC -+/// @param dstFormat - Format for hot tile. -+/// @param renderTargetIndex - Index to src render target -+/// @param x, y - Coordinates to raster tile. 
-+/// @param pDstHotTile - Pointer to Hot Tile -+void LoadHotTile( -+ SWR_SURFACE_STATE *pSrcSurface, -+ SWR_FORMAT dstFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, -+ uint8_t *pDstHotTile) -+{ -+ PFN_LOAD_TILES pfnLoadTiles = NULL; -+ -+ // don't need to load null surfaces -+ if (pSrcSurface->type == SURFACE_NULL) -+ { -+ return; -+ } -+ -+ if (renderTargetIndex < SWR_ATTACHMENT_DEPTH) -+ { -+ switch (pSrcSurface->tileMode) -+ { -+ case SWR_TILE_NONE: -+ pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format]; -+ break; -+ case SWR_TILE_MODE_YMAJOR: -+ pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; -+ break; -+ case SWR_TILE_MODE_XMAJOR: -+ pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format]; -+ break; -+ default: -+ SWR_ASSERT(0, "Unsupported tiling mode"); -+ break; -+ } -+ } -+ else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) -+ { -+ // Currently depth can map to linear and tile-y. -+ switch (pSrcSurface->tileMode) -+ { -+ case SWR_TILE_NONE: -+ pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format]; -+ break; -+ case SWR_TILE_MODE_YMAJOR: -+ pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; -+ break; -+ default: -+ SWR_ASSERT(0, "Unsupported tiling mode"); -+ break; -+ } -+ } -+ else -+ { -+ SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL); -+ SWR_ASSERT(pSrcSurface->format == R8_UINT); -+ switch (pSrcSurface->tileMode) -+ { -+ case SWR_TILE_NONE: -+ pfnLoadTiles = LoadMacroTile, R8_UINT, R8_UINT>::Load; -+ break; -+ case SWR_TILE_MODE_WMAJOR: -+ pfnLoadTiles = LoadMacroTile, R8_UINT, R8_UINT>::Load; -+ break; -+ default: -+ SWR_ASSERT(0, "Unsupported tiling mode"); -+ break; -+ } -+ } -+ -+ SWR_ASSERT(pfnLoadTiles != NULL); -+ -+ // Load a macro tile. -+#ifdef KNOB_ENABLE_RDTSC -+ if (sBuckets[pSrcSurface->format] == -1) -+ { -+ // guard sBuckets update since storetiles is called by multiple threads -+ sBucketMutex.lock(); -+ if (sBuckets[pSrcSurface->format] == -1) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format); -+ BUCKET_DESC desc{ info.name, "", false, 0xffffffff }; -+ sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc); -+ } -+ sBucketMutex.unlock(); -+ } -+#endif -+ -+ BUCKETS_START(sBuckets[pSrcSurface->format]); -+ pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex); -+ BUCKETS_STOP(sBuckets[pSrcSurface->format]); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables. 
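The macro below fills one of these per-tiling-mode tables with LoadMacroTile instantiations, one per color format. The loaders above write each pixel into a SIMD-swizzled hot tile; a minimal sketch of that index math, using assumed tile dimensions (8x8 raster tiles, 4x2 SIMD tiles for an 8-wide target) purely for illustration:

    // Illustrative only -- the real dimensions come from the KNOB_* / SIMD_TILE_* constants.
    constexpr uint32_t TILE_X = 8, SIMD_X = 4, SIMD_Y = 2;
    constexpr uint32_t x = 5, y = 3; // pixel within the 8x8 raster tile
    constexpr uint32_t simdIndex  = (y / SIMD_Y) * (TILE_X / SIMD_X) + (x / SIMD_X); // == 3
    constexpr uint32_t simdOffset = (y % SIMD_Y) * SIMD_X + (x % SIMD_X);            // == 5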
-+#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \ -+ memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \ -+ \ -+ sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ 
sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile, R32_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile, R32_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile, R16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile, R16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile, R16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile, R16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile, A16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ 
sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile, R8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile, R8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile, R8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile, R8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile, A8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = LoadMacroTile, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \ -+ sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \ -+ 
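// ---------------------------------------------------------------------------
// Note: the INIT_LOAD_TILES_*_TABLE macros above populate per-tile-mode lookup
// tables indexed by SWR_FORMAT, so LoadHotTile can pick a fully specialized
// template instantiation with a single array read. The following is a minimal
// standalone sketch of that dispatch pattern using toy enums and a toy loader;
// it is illustrative only and none of these names are part of the SWR sources.
#include <cassert>
#include <cstdint>

enum ToyFormat   { FMT_R8_UNORM, FMT_R32_FLOAT, NUM_TOY_FORMATS };
enum ToyTileMode { TILE_LINEAR, TILE_Y, NUM_TOY_TILE_MODES };

typedef void (*PFN_TOY_LOAD)(const uint8_t* pSrc, uint8_t* pDst);

template <ToyTileMode TileMode, ToyFormat Format>
static void ToyLoadTile(const uint8_t* pSrc, uint8_t* pDst)
{
    // A real loader would detile and convert here; the point is that TileMode
    // and Format are compile-time constants inside this body.
    (void)pSrc; (void)pDst;
}

static PFN_TOY_LOAD sToyLoadTable[NUM_TOY_TILE_MODES][NUM_TOY_FORMATS];

static void InitToyLoadTable()
{
    // One entry per (tile mode, format), mirroring the macros above.
    sToyLoadTable[TILE_LINEAR][FMT_R8_UNORM]  = ToyLoadTile<TILE_LINEAR, FMT_R8_UNORM>;
    sToyLoadTable[TILE_LINEAR][FMT_R32_FLOAT] = ToyLoadTile<TILE_LINEAR, FMT_R32_FLOAT>;
    sToyLoadTable[TILE_Y][FMT_R8_UNORM]       = ToyLoadTile<TILE_Y, FMT_R8_UNORM>;
    sToyLoadTable[TILE_Y][FMT_R32_FLOAT]      = ToyLoadTile<TILE_Y, FMT_R32_FLOAT>;
}

static void ToyLoadHotTile(ToyTileMode mode, ToyFormat fmt,
                           const uint8_t* pSrc, uint8_t* pDst)
{
    PFN_TOY_LOAD pfn = sToyLoadTable[mode][fmt];
    assert(pfn != nullptr && "unsupported tile mode / format combination");
    pfn(pSrc, pDst);
}
// ---------------------------------------------------------------------------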
-+////////////////////////////////////////////////////////////////////////// -+/// INIT_LOAD_TILES_TABLE - Helper macro for setting up the tables. -+#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \ -+ memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \ -+ \ -+ sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile, R16_UNORM, R32_FLOAT>::Load; \ -+ sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile, R32_FLOAT, R32_FLOAT>::Load; \ -+ sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \ -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sets up tables for LoadTile -+void InitSimLoadTilesTable() -+{ -+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE); -+ INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE); -+ -+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR); -+ INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR); -+ -+ INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp -new file mode 100644 -index 0000000..fbd76a3 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp -@@ -0,0 +1,1645 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file StoreTile.cpp -+* -+* @brief Functionality for Store. -+* -+******************************************************************************/ -+#include "common/os.h" -+#include "common/formats.h" -+#include "core/context.h" -+#include "core/rdtsc_core.h" -+#include "core/format_conversion.h" -+ -+#include "memory/TilingFunctions.h" -+#include "memory/tilingtraits.h" -+#include "memory/Convert.h" -+#include "core/multisample.h" -+ -+#include -+#include -+ -+typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); -+ -+////////////////////////////////////////////////////////////////////////// -+/// Store Raster Tile Function Tables. 
-+////////////////////////////////////////////////////////////////////////// -+static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -+static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -+static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template -+struct StorePixels -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<8, 2> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) -+ { -+ // Each 4-pixel row is 4 bytes. -+ const uint16_t* pPixSrc = (const uint16_t*)pSrc; -+ -+ // Unswizzle from SWR-Z order -+ uint16_t* pRow = (uint16_t*)ppDsts[0]; -+ pRow[0] = pPixSrc[0]; -+ pRow[1] = pPixSrc[2]; -+ -+ pRow = (uint16_t*)ppDsts[1]; -+ pRow[0] = pPixSrc[1]; -+ pRow[1] = pPixSrc[3]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<16, 2> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) -+ { -+ // Each 4-pixel row is 8 bytes. -+ const uint32_t* pPixSrc = (const uint32_t*)pSrc; -+ -+ // Unswizzle from SWR-Z order -+ uint32_t* pRow = (uint32_t*)ppDsts[0]; -+ pRow[0] = pPixSrc[0]; -+ pRow[1] = pPixSrc[2]; -+ -+ pRow = (uint32_t*)ppDsts[1]; -+ pRow[0] = pPixSrc[1]; -+ pRow[1] = pPixSrc[3]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. 
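// ---------------------------------------------------------------------------
// Note: the StorePixels specializations above unswizzle a 4x2-pixel SIMD tile
// from "SWR-Z" order into two linear destination rows. Judging from the index
// pattern in the 8/16/32-bit cases, the source appears to hold two 2x2 quads,
// each quad stored row 0 then row 1, with the quads laid out left to right.
// Below is a scalar, one-pixel-at-a-time sketch of that indexing under that
// assumption; the real code moves whole row fragments with wider loads/stores.
#include <cstdint>

template <int BytesPerPixel>
static void UnswizzleSwrZ4x2(const uint8_t* pSrc, uint8_t* pRow0, uint8_t* pRow1)
{
    for (int quad = 0; quad < 2; ++quad)            // two 2x2 quads side by side
    {
        for (int y = 0; y < 2; ++y)                 // two rows within the quad
        {
            for (int x = 0; x < 2; ++x)             // two pixels within the row
            {
                const uint8_t* src = pSrc +
                    ((quad * 4) + (y * 2) + x) * BytesPerPixel;   // SWR-Z order
                uint8_t* dst = (y == 0 ? pRow0 : pRow1) +
                    ((quad * 2) + x) * BytesPerPixel;             // linear rows
                for (int b = 0; b < BytesPerPixel; ++b)
                    dst[b] = src[b];
            }
        }
    }
}
// With BytesPerPixel == 2 this reproduces the StorePixels<16, 2> mapping:
// destination row 0 receives source words 0 and 2, row 1 receives 1 and 3.
// ---------------------------------------------------------------------------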
-+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<32, 2> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) -+ { -+ // Each 4-pixel row is 16-bytes -+ __m128i *pZRow01 = (__m128i*)pSrc; -+ __m128i vQuad00 = _mm_load_si128(pZRow01); -+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); -+ -+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); -+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); -+ -+ _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); -+ _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<64, 4> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) -+ { -+ // Each 4-pixel row is 32 bytes. -+ const __m128i* pPixSrc = (const __m128i*)pSrc; -+ -+ // order of pointers match SWR-Z layout -+ __m128i** pvDsts = (__m128i**)&ppDsts[0]; -+ *pvDsts[0] = pPixSrc[0]; -+ *pvDsts[1] = pPixSrc[1]; -+ *pvDsts[2] = pPixSrc[2]; -+ *pvDsts[3] = pPixSrc[3]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StorePixels (32-bit pixel specialization) -+/// @brief Stores a 4x2 (AVX) raster-tile to two rows. -+/// @param pSrc - Pointer to source raster tile in SWRZ pixel order -+/// @param ppDsts - Array of destination pointers. Each pointer is -+/// to a single row of at most 16B. -+/// @tparam NumDests - Number of destination pointers. Each pair of -+/// pointers is for a 16-byte column of two rows. -+////////////////////////////////////////////////////////////////////////// -+template <> -+struct StorePixels<128, 8> -+{ -+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) -+ { -+ // Each 4-pixel row is 64 bytes. -+ const __m128i* pPixSrc = (const __m128i*)pSrc; -+ -+ // Unswizzle from SWR-Z order -+ __m128i** pvDsts = (__m128i**)&ppDsts[0]; -+ *pvDsts[0] = pPixSrc[0]; -+ *pvDsts[1] = pPixSrc[2]; -+ *pvDsts[2] = pPixSrc[1]; -+ *pvDsts[3] = pPixSrc[3]; -+ *pvDsts[4] = pPixSrc[4]; -+ *pvDsts[5] = pPixSrc[6]; -+ *pvDsts[6] = pPixSrc[5]; -+ *pvDsts[7] = pPixSrc[7]; -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -+////////////////////////////////////////////////////////////////////////// -+template -+struct ConvertPixelsSOAtoAOS -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Converts a SIMD from the Hot Tile to the destination format -+ /// and converts from SOA to AOS. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDst - Pointer to destination surface or deswizzling buffer. 
-+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel -+ -+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; -+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; -+ -+ // Convert from SrcFormat --> DstFormat -+ simdvector src; -+ LoadSOA(pSrc, src); -+ StoreSOA(src, soaTile); -+ -+ // Convert from SOA --> AOS -+ FormatTraits::TransposeT::Transpose(soaTile, aosTile); -+ -+ // Store data into destination -+ StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -+/// Specialization for no format conversion -+////////////////////////////////////////////////////////////////////////// -+template -+struct ConvertPixelsSOAtoAOS -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Converts a SIMD from the Hot Tile to the destination format -+ /// and converts from SOA to AOS. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDst - Pointer to destination surface or deswizzling buffer. -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel -+ -+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; -+ -+ // Convert from SOA --> AOS -+ FormatTraits::TransposeT::Transpose(pSrc, aosTile); -+ -+ // Store data into destination -+ StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) -+////////////////////////////////////////////////////////////////////////// -+template<> -+struct ConvertPixelsSOAtoAOS -+{ -+ static const SWR_FORMAT SrcFormat = R32_FLOAT; -+ static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Converts a SIMD from the Hot Tile to the destination format -+ /// and converts from SOA to AOS. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDst - Pointer to destination surface or deswizzling buffer. 
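// ---------------------------------------------------------------------------
// Note: ConvertPixelsSOAtoAOS above does two things per SIMD tile: an optional
// format conversion, then a structure-of-arrays to array-of-structures
// rearrangement before the raw row stores. The hot tile keeps one plane per
// channel (RRRR..., GGGG..., ...) while the surface wants interleaved pixels
// (RGBA RGBA ...). A minimal scalar reference of just the SOA -> AOS step is
// sketched here; it is illustrative only and stands in for the per-format
// SIMD transpose routines used by the driver.
#include <cstddef>

static void TransposeSoaToAos(const float* pSoa,  // numChannels planes of numPixels values
                              float*       pAos,  // numPixels interleaved pixels
                              size_t numPixels,
                              size_t numChannels)
{
    for (size_t p = 0; p < numPixels; ++p)
        for (size_t c = 0; c < numChannels; ++c)
            pAos[p * numChannels + c] = pSoa[c * numPixels + p];
}

// Example: for an 8-wide SIMD tile of 4-channel float pixels, numPixels == 8
// and numChannels == 4, so 4 planes of 8 floats become 8 interleaved pixels.
// ---------------------------------------------------------------------------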
-+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel -+ -+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; -+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; -+ -+ // Convert from SrcFormat --> DstFormat -+ simdvector src; -+ LoadSOA(pSrc, src); -+ StoreSOA(src, soaTile); -+ -+ // Convert from SOA --> AOS -+ FormatTraits::TransposeT::Transpose(soaTile, aosTile); -+ -+ // Store data into destination but don't overwrite the X8 bits -+ // Each 4-pixel row is 16-bytes -+ __m128i *pZRow01 = (__m128i*)aosTile; -+ __m128i vQuad00 = _mm_load_si128(pZRow01); -+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); -+ -+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); -+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); -+ -+ __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); -+ __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); -+ -+ __m128i vMask = _mm_set1_epi32(0xFFFFFF); -+ -+ vDst0 = _mm_andnot_si128(vMask, vDst0); -+ vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); -+ vDst1 = _mm_andnot_si128(vMask, vDst1); -+ vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); -+ -+ _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); -+ _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); -+ } -+}; -+ -+template -+INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -+{ -+ static const uint32_t offset = sizeof(simdscalar); -+ -+ // swizzle rgba -> bgra while we load -+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr -+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg -+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb -+ simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(3))*offset)); // float32 aaaaaaaa -+ -+ // clamp -+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); -+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); -+ -+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); -+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); -+ -+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); -+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); -+ -+ vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); -+ vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); -+ -+ if (FormatTraits::isSRGB) -+ { -+ // Gamma-correct only rgb -+ vComp0 = FormatTraits::convertSrgb(0, vComp0); -+ vComp1 = FormatTraits::convertSrgb(1, vComp1); -+ vComp2 = FormatTraits::convertSrgb(2, vComp2); -+ } -+ -+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 
255 dest format -+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); -+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); -+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); -+ vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits::fromFloat(3))); -+ -+ // moving to 8 wide integer vector types -+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr -+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg -+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb -+ __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa -+ -+#if KNOB_ARCH == KNOB_ARCH_AVX -+ -+ // splitting into two sets of 4 wide integer vector types -+ // because AVX doesn't have instructions to support this operation at 8 wide -+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r -+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g -+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b -+ __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a -+ -+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r -+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g -+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b -+ __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a -+ -+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 -+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 -+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 -+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 -+ srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 -+ srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr -+ srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 -+ -+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr -+ srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr -+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr -+ -+ // unpack into rows that get the tiling order correct -+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr -+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); -+ -+ __m256i final = _mm256_castsi128_si256(vRow00); -+ final = _mm256_insertf128_si256(final, vRow10, 1); -+ -+#elif KNOB_ARCH == KNOB_ARCH_AVX2 -+ -+ // logic is as above, only wider -+ src1 = _mm256_slli_si256(src1, 1); -+ src2 = _mm256_slli_si256(src2, 2); -+ src3 = _mm256_slli_si256(src3, 3); -+ -+ src0 = _mm256_or_si256(src0, src1); -+ src2 = _mm256_or_si256(src2, src3); -+ -+ __m256i final = _mm256_or_si256(src0, src2); -+ -+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 -+ final = _mm256_permute4x64_epi64(final, 0xD8); -+ -+#endif -+ -+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); -+} -+ -+template -+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) -+{ -+ static const uint32_t offset = sizeof(simdscalar); -+ -+ // swizzle rgba -> bgra while we load -+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(0))*offset)); // float32 rrrrrrrr -+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(1))*offset)); // float32 gggggggg -+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits::swizzle(2))*offset)); // float32 bbbbbbbb 
-+ // clamp -+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); -+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); -+ -+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); -+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); -+ -+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); -+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); -+ -+ if (FormatTraits::isSRGB) -+ { -+ // Gamma-correct only rgb -+ vComp0 = FormatTraits::convertSrgb(0, vComp0); -+ vComp1 = FormatTraits::convertSrgb(1, vComp1); -+ vComp2 = FormatTraits::convertSrgb(2, vComp2); -+ } -+ -+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format -+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits::fromFloat(0))); -+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits::fromFloat(1))); -+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits::fromFloat(2))); -+ -+ // moving to 8 wide integer vector types -+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr -+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg -+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb -+ -+#if KNOB_ARCH == KNOB_ARCH_AVX -+ -+ // splitting into two sets of 4 wide integer vector types -+ // because AVX doesn't have instructions to support this operation at 8 wide -+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r -+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g -+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b -+ -+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r -+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g -+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b -+ -+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 -+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 -+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 -+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr -+ -+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr -+ -+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr -+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr -+ -+ // unpack into rows that get the tiling order correct -+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr -+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); -+ -+ __m256i final = _mm256_castsi128_si256(vRow00); -+ final = _mm256_insertf128_si256(final, vRow10, 1); -+ -+#elif KNOB_ARCH == KNOB_ARCH_AVX2 -+ -+ // logic is as above, only wider -+ src1 = _mm256_slli_si256(src1, 1); -+ src2 = _mm256_slli_si256(src2, 2); -+ -+ src0 = _mm256_or_si256(src0, src1); -+ -+ __m256i final = _mm256_or_si256(src0, src2); -+ -+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 -+ final = _mm256_permute4x64_epi64(final, 0xD8); -+ -+#endif -+ -+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); -+} -+ -+template<> -+struct ConvertPixelsSOAtoAOS -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > -+{ -+ template -+ 
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvert(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+template<> -+struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > -+{ -+ template -+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) -+ { -+ FlatConvertNoAlpha(pSrc, ppDsts[0], ppDsts[1]); -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreRasterTile -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreRasterTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from hot tile source which is always float. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param x, y - Coordinates to raster tile. -+ /// @param output - output color -+ INLINE static void GetSwizzledSrcColor( -+ uint8_t* pSrc, -+ uint32_t x, uint32_t y, -+ float outputColor[4]) -+ { -+ typedef SimdTile SimdT; -+ -+ SimdT* pSrcSimdTiles = (SimdT*)pSrc; -+ -+ // Compute which simd tile we're accessing within 8x8 tile. -+ // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. -+ uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); -+ -+ SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; -+ -+ uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); -+ -+ pSimdTile->GetSwizzledColor(simdOffset, outputColor); -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. -+ { -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ -+ // For each raster tile pixel (rx, ry) -+ for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) -+ { -+ for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) -+ { -+ // Perform bounds checking. 
-+ if (((x + rx) < lodWidth) && -+ ((y + ry) < lodHeight)) -+ { -+ float srcColor[4]; -+ GetSwizzledSrcColor(pSrc, rx, ry, srcColor); -+ -+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry), -+ pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ sampleNum, pDstSurface->lod, pDstSurface); -+ ConvertPixelFromFloat(pDst, srcColor); -+ } -+ } -+ } -+ } -+}; -+ -+template -+struct OptStoreRasterTile : StoreRasterTile -+{}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); -+ -+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; -+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; -+ } -+ -+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. 
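// ---------------------------------------------------------------------------
// Note: every OptStoreRasterTile specialization begins with the same guard
// seen above: clamp the surface dimensions to the current LOD and punt any
// raster tile that hangs off the surface edge to the generic per-pixel
// StoreRasterTile path, because the fast path writes whole SIMD rows with no
// per-pixel bounds checks. A standalone sketch of that guard follows; the toy
// types, tile size, and helper names are illustrative, not the SWR API.
#include <algorithm>
#include <cstdint>

struct ToySurface { uint32_t width, height, lod; };

static const uint32_t TOY_TILE_DIM = 8;

static void StoreTileSafe(const ToySurface&, uint32_t, uint32_t) { /* bounds-checked per-pixel path */ }
static void StoreTileFast(const ToySurface&, uint32_t, uint32_t) { /* unchecked full-tile SIMD path */ }

static void StoreTile(const ToySurface& surf, uint32_t x, uint32_t y)
{
    // Mip level dimensions, clamped to at least one pixel.
    uint32_t lodWidth  = std::max<uint32_t>(surf.width  >> surf.lod, 1u);
    uint32_t lodHeight = std::max<uint32_t>(surf.height >> surf.lod, 1u);

    if (x + TOY_TILE_DIM > lodWidth || y + TOY_TILE_DIM > lodHeight)
    {
        StoreTileSafe(surf, x, y);   // partial tile at the surface edge
        return;
    }
    StoreTileFast(surf, x, y);       // full interior tile
}
// ---------------------------------------------------------------------------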
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); -+ -+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; -+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; -+ } -+ -+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); -+ -+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; -+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; -+ } -+ -+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t MAX_DST_COLUMN_BYTES = 16; -+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; -+ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* ppDsts[] = -+ { -+ pDst, // row 0, col 0 -+ pDst + pDstSurface->pitch, // row 1, col 0 -+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 -+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 -+ }; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ uint8_t* ppStartRows[] = -+ { -+ ppDsts[0], -+ ppDsts[1], -+ ppDsts[2], -+ ppDsts[3], -+ }; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ -+ ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; -+ ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; -+ ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; -+ ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; -+ pSrc += SRC_COLUMN_BYTES; -+ } -+ -+ ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; -+ ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; -+ ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; -+ ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t MAX_DST_COLUMN_BYTES = 16; -+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; -+ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ struct DstPtrs -+ { -+ uint8_t* ppDsts[8]; -+ } ptrs; -+ -+ // Need 8 pointers, 4 columns of 2 rows each -+ for (uint32_t y = 0; y < 2; ++y) -+ { -+ for (uint32_t x = 0; x < 4; ++x) -+ { -+ ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; -+ } -+ } -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ DstPtrs startPtrs = ptrs; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); -+ -+ ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; -+ pSrc += SRC_COLUMN_BYTES; -+ } -+ -+ ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; -+ ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. -+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
-+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ -+ uint8_t* pRow = pCol0 + rowOffset; -+ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestRowWidthBytes / 4; -+ ppDsts[1] += DestRowWidthBytes / 4; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. -+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ -+ uint8_t* pRow = pCol0 + rowOffset; -+ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestRowWidthBytes / 2; -+ ppDsts[1] += DestRowWidthBytes / 2; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. 
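// ---------------------------------------------------------------------------
// Note: the YMAJOR and XMAJOR specializations above rely on the tile layouts
// described in their comments: a TileY 4KB tile is column-major, 8 columns of
// (32 rows x 16 bytes), so stepping down one row adds 16 bytes and stepping to
// the next column adds 32 * 16 = 512 bytes; a TileX 4KB tile is row-major,
// 8 rows of 512 bytes. The helpers below sketch the byte offset of a location
// within one such tile under exactly those assumptions; they are illustrative
// and are not SWR's actual addressing code.
#include <cstdint>

static uint32_t TileYOffset(uint32_t xBytes, uint32_t y)
{
    uint32_t column        = xBytes / 16;          // which 16B-wide column
    uint32_t xWithinColumn = xBytes % 16;
    return column * (32 * 16) + y * 16 + xWithinColumn;
}

static uint32_t TileXOffset(uint32_t xBytes, uint32_t y)
{
    return y * 512 + xBytes;                       // plain row-major rows
}

// Example: for 32bpp pixels, pixel (x = 5, y = 3) within a tile has
// xBytes = 20, so TileY places it at 1*512 + 3*16 + 4 = 564 bytes from the
// tile base, while TileX places it at 3*512 + 20 = 1556 bytes.
// ---------------------------------------------------------------------------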
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 512; // 512B rows -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* pRow1 = pRow0 + DestRowWidthBytes; -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) -+ { -+ uint32_t xRowOffset = col * (FormatTraits::bpp / 8); -+ -+ uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ pSrc += (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ } -+ -+ pRow0 += (DestRowWidthBytes * 2); -+ pRow1 += (DestRowWidthBytes * 2); -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. 
-+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ -+ uint8_t* pRow = pCol0 + rowOffset; -+ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestColumnBytes; -+ ppDsts[1] += DestColumnBytes; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ static const uint32_t DestRowWidthBytes = 16; // 16B rows -+ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. -+ -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. -+ // We can compute the offsets to each column within the raster tile once and increment from these. -+ // There will be 2 x 4-wide columns in an 8x8 raster tile. -+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ uint8_t* pCol1 = pCol0 + DestColumnBytes; -+ -+ // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. -+ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. -+ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; -+ -+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
-+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) -+ { -+ uint32_t rowOffset = row * DestRowWidthBytes; -+ uint8_t* ppDsts[] = -+ { -+ pCol0 + rowOffset, -+ pCol0 + rowOffset + DestRowWidthBytes, -+ pCol1 + rowOffset, -+ pCol1 + rowOffset + DestRowWidthBytes, -+ }; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ -+ ppDsts[0] += DestColumnBytes * 2; -+ ppDsts[1] += DestColumnBytes * 2; -+ ppDsts[2] += DestColumnBytes * 2; -+ ppDsts[3] += DestColumnBytes * 2; -+ -+ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); -+ pSrc += pSrcInc; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp -+////////////////////////////////////////////////////////////////////////// -+template -+struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > -+{ -+ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; -+ -+ static const size_t TILE_Y_COL_WIDTH_BYTES = 16; -+ static const size_t TILE_Y_ROWS = 32; -+ static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; -+ -+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; -+ static const size_t MAX_DST_COLUMN_BYTES = 16; -+ -+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; -+ static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores an 8x8 raster tile to the destination surface. -+ /// @param pSrc - Pointer to raster tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to raster tile. -+ INLINE static void Store( -+ uint8_t *pSrc, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) -+ { -+ // Punt non-full tiles to generic store -+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); -+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); -+ if (x + KNOB_TILE_X_DIM > lodWidth || -+ y + KNOB_TILE_Y_DIM > lodHeight) -+ { -+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); -+ } -+ -+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -+ struct DstPtrs -+ { -+ uint8_t* ppDsts[8]; -+ } ptrs; -+ -+ // Need 8 pointers, 4 columns of 2 rows each -+ for (uint32_t y = 0; y < 2; ++y) -+ { -+ for (uint32_t x = 0; x < 4; ++x) -+ { -+ ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; -+ } -+ } -+ -+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) -+ { -+ DstPtrs startPtrs = ptrs; -+ -+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) -+ { -+ // Format conversion and convert from SOA to AOS, and store the rows. 
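-+ // (Illustrative, assuming the 4x2 AVX simd tile: at 128bpp one pixel fills an entire 16B TileY -+ // row, so each simd tile consumes the eight pointers set up above - four adjacent columns spaced -+ // TILE_Y_COL_BYTES apart by two rows spaced TILE_Y_COL_WIDTH_BYTES apart.)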
-+ ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); -+ -+ ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; -+ ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; -+ pSrc += SRC_COLUMN_BYTES; -+ } -+ -+ ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// StoreMacroTile - Stores a macro tile which consists of raster tiles. -+////////////////////////////////////////////////////////////////////////// -+template -+struct StoreMacroTile -+{ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores a macrotile to the destination surface using safe implementation. -+ /// @param pSrc - Pointer to macro tile. -+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void StoreGeneric( -+ uint8_t *pSrcHotTile, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) -+ { -+ // Store each raster tile from the hot tile to the destination surface. -+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) -+ { -+ StoreRasterTile::Store (pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, -+ renderTargetArrayIndex); -+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); -+ } -+ } -+ } -+ } -+ -+ typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Stores a macrotile to the destination surface. -+ /// @param pSrc - Pointer to macro tile. 
-+ /// @param pDstSurface - Destination surface state -+ /// @param x, y - Coordinates to macro tile -+ static void Store( -+ uint8_t *pSrcHotTile, -+ SWR_SURFACE_STATE* pDstSurface, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) -+ { -+ PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; -+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) -+ { -+ size_t dstSurfAddress = (size_t)ComputeSurfaceAddress( -+ 0, -+ 0, -+ pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces -+ pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays -+ sampleNum, -+ pDstSurface->lod, -+ pDstSurface); -+ -+ // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear -+ bool bForceGeneric = (pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff)); -+ -+ pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile::Store : OptStoreRasterTile::Store; -+ } -+ -+ // Store each raster tile from the hot tile to the destination surface. -+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) -+ { -+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) -+ { -+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) -+ { -+ pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); -+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); -+ } -+ } -+ } -+ } -+}; -+ -+static void BUCKETS_START(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StartBucket(id); -+#endif -+} -+ -+static void BUCKETS_STOP(UINT id) -+{ -+#ifdef KNOB_ENABLE_RDTSC -+ gBucketMgr.StopBucket(id); -+#endif -+} -+ -+// on demand buckets for store tiles -+static std::mutex sBucketMutex; -+static std::vector sBuckets(NUM_SWR_FORMATS, -1); -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Deswizzles and stores a full hottile to a render surface -+/// @param hPrivateContext - Handle to private DC -+/// @param srcFormat - Format for hot tile. -+/// @param renderTargetIndex - Index to destination render target -+/// @param x, y - Coordinates to raster tile. 
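-+/// @param renderTargetArrayIndex - Render target array index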
-+/// @param pSrcHotTile - Pointer to Hot Tile -+void StoreHotTile( -+ SWR_SURFACE_STATE *pDstSurface, -+ SWR_FORMAT srcFormat, -+ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, -+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, -+ uint8_t *pSrcHotTile) -+{ -+ // shouldn't ever see a null surface come through StoreTiles -+ SWR_ASSERT(pDstSurface->type != SURFACE_NULL); -+ -+ PFN_STORE_TILES pfnStoreTiles = nullptr; -+ if(renderTargetIndex <= SWR_ATTACHMENT_COLOR7) -+ { -+ pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format]; -+ } -+ else if(renderTargetIndex == SWR_ATTACHMENT_DEPTH) -+ { -+ pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format]; -+ } -+ else -+ { -+ pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format]; -+ } -+ -+ if(nullptr == pfnStoreTiles) -+ { -+ SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles"); -+ } -+ -+ // Store a macro tile -+#ifdef KNOB_ENABLE_RDTSC -+ if (sBuckets[pDstSurface->format] == -1) -+ { -+ // guard sBuckets update since storetiles is called by multiple threads -+ sBucketMutex.lock(); -+ if (sBuckets[pDstSurface->format] == -1) -+ { -+ const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format); -+ BUCKET_DESC desc{info.name, "", false, 0xffffffff}; -+ sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc); -+ } -+ sBucketMutex.unlock(); -+ } -+#endif -+ -+ BUCKETS_START(sBuckets[pDstSurface->format]); -+ pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex); -+ BUCKETS_STOP(sBuckets[pDstSurface->format]); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// InitStoreTilesTable - Helper for setting up the tables. -+template -+void InitStoreTilesTableColor( -+ PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) -+{ -+ table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; -+ table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; -+ table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; -+ table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; -+ table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; -+ table[TileModeT][R32G32B32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; -+ table[TileModeT][R32G32B32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; -+ table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; -+ table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; -+ table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; -+ table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; -+ table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; -+ table[TileModeT][R32G32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; -+ table[TileModeT][R32G32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_SINT>::Store; -+ table[TileModeT][R32G32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_UINT>::Store; -+ table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; -+ table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile, 
R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; -+ table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; -+ table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; -+ table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; -+ -+ table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; -+ table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; -+ table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; -+ table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; -+ table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; -+ table[TileModeT][R16G16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; -+ table[TileModeT][R16G16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; -+ table[TileModeT][R16G16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SINT>::Store; -+ table[TileModeT][R16G16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UINT>::Store; -+ table[TileModeT][R16G16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; -+ -+ table[TileModeT][R32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_SINT>::Store; -+ table[TileModeT][R32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_UINT>::Store; -+ table[TileModeT][R32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_FLOAT>::Store; -+ table[TileModeT][A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A32_FLOAT>::Store; -+ table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; -+ table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; -+ table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; -+ table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; -+ table[TileModeT][B5G6R5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreGeneric; -+ table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; -+ table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; -+ table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, 
B4G4R4A4_UNORM>::StoreGeneric; -+ table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; -+ -+ table[TileModeT][R8G8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; -+ table[TileModeT][R8G8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; -+ table[TileModeT][R8G8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SINT>::Store; -+ table[TileModeT][R8G8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UINT>::Store; -+ table[TileModeT][R16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UNORM>::Store; -+ table[TileModeT][R16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SNORM>::Store; -+ table[TileModeT][R16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SINT>::Store; -+ table[TileModeT][R16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UINT>::Store; -+ table[TileModeT][R16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_FLOAT>::Store; -+ table[TileModeT][A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A16_UNORM>::Store; -+ table[TileModeT][A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A16_FLOAT>::Store; -+ -+ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; -+ table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; -+ -+ table[TileModeT][R8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UNORM>::Store; -+ table[TileModeT][R8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SNORM>::Store; -+ table[TileModeT][R8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SINT>::Store; -+ table[TileModeT][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; -+ table[TileModeT][A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A8_UNORM>::Store; -+ table[TileModeT][BC1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM>::Store; -+ table[TileModeT][BC2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM>::Store; -+ table[TileModeT][BC3_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM>::Store; -+ table[TileModeT][BC4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_UNORM>::Store; -+ table[TileModeT][BC5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_UNORM>::Store; -+ table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; -+ table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; -+ table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; -+ table[TileModeT][R8G8B8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; -+ table[TileModeT][R8G8B8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; -+ table[TileModeT][BC4_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_SNORM>::Store; -+ table[TileModeT][BC5_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_SNORM>::Store; -+ table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; -+ table[TileModeT][R16G16B16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; -+ table[TileModeT][R16G16B16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; -+ table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; -+ table[TileModeT][R16G16B16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; -+ table[TileModeT][R16G16B16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; -+ -+ // 
101010_2, 565, 555_1, and 444_4 formats force generic store tile for now -+ table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; -+ table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; -+ table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; -+ -+ table[TileModeT][R8G8B8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; -+ table[TileModeT][R8G8B8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. -+template -+void InitStoreTilesTableDepth( -+ PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -+{ -+ table[TileModeT][R32_FLOAT] = StoreMacroTile, R32_FLOAT, R32_FLOAT>::Store; -+ table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; -+ table[TileModeT][R16_UNORM] = StoreMacroTile, R32_FLOAT, R16_UNORM>::Store; -+} -+ -+template -+void InitStoreTilesTableStencil( -+ PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) -+{ -+ table[TileModeT][R8_UINT] = StoreMacroTile, R8_UINT, R8_UINT>::Store; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Sets up tables for StoreTile -+void InitSimStoreTilesTable() -+{ -+ InitStoreTilesTableColor(sStoreTilesTableColor); -+ InitStoreTilesTableDepth(sStoreTilesTableDepth); -+ InitStoreTilesTableStencil(sStoreTilesTableStencil); -+ -+ InitStoreTilesTableColor(sStoreTilesTableColor); -+ InitStoreTilesTableColor(sStoreTilesTableColor); -+ -+ InitStoreTilesTableDepth(sStoreTilesTableDepth); -+ InitStoreTilesTableStencil(sStoreTilesTableStencil); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h -new file mode 100644 -index 0000000..78f54f8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h -@@ -0,0 +1,518 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file TilingFunctions.h -+* -+* @brief Tiling functions. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "core/state.h" -+#include "core/format_traits.h" -+#include "memory/tilingtraits.h" -+ -+#include -+ -+#define MAX_NUM_LOD 15 -+ -+#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit. -+ -+////////////////////////////////////////////////////////////////////////// -+/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?) -+////////////////////////////////////////////////////////////////////////// -+template -+struct SimdTile -+{ -+ // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa ) -+ float color[FormatTraits::numComps][KNOB_SIMD_WIDTH]; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from simd. -+ /// @param index - linear index to color within simd. -+ /// @param outputColor - output color -+ INLINE void GetSwizzledColor( -+ uint32_t index, -+ float outputColor[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. -+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ outputColor[i] = this->color[FormatTraits::swizzle(i)][offset[index]]; -+ } -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from simd. -+ /// @param index - linear index to color within simd. -+ /// @param outputColor - output color -+ INLINE void SetSwizzledColor( -+ uint32_t index, -+ const float src[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. -+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ // Only loop over the components needed for destination. -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ this->color[i][offset[index]] = src[i]; -+ } -+ } -+}; -+ -+template<> -+struct SimdTile -+{ -+ // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa ) -+ uint8_t color[FormatTraits::numComps][KNOB_SIMD_WIDTH]; -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Retrieve color from simd. -+ /// @param index - linear index to color within simd. -+ /// @param outputColor - output color -+ INLINE void GetSwizzledColor( -+ uint32_t index, -+ float outputColor[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. 
-+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ uint32_t src = this->color[FormatTraits::swizzle(i)][offset[index]]; -+ outputColor[i] = *(float*)&src; -+ } -+ } -+ -+ ////////////////////////////////////////////////////////////////////////// -+ /// @brief Set color within simd. -+ /// @param index - linear index to color within simd. -+ /// @param src - input color -+ INLINE void SetSwizzledColor( -+ uint32_t index, -+ const float src[4]) -+ { -+ // SOA pattern for 2x2 is a subset of 4x2. -+ // 0 1 4 5 -+ // 2 3 6 7 -+ // The offset converts pattern to linear -+#if (SIMD_TILE_X_DIM == 4) -+ static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 }; -+#elif (SIMD_TILE_X_DIM == 2) -+ static const uint32_t offset[] = { 0, 1, 2, 3 }; -+#endif -+ -+ // Only loop over the components needed for destination. -+ for (uint32_t i = 0; i < FormatTraits::numComps; ++i) -+ { -+ this->color[i][offset[index]] = *(uint8_t*)&src[i]; -+ } -+ } -+}; -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes lod offset for 1D surface at specified lod. -+/// @param baseWidth - width of basemip (mip 0). -+/// @param hAlign - horizontal alignment per mip, in texels -+/// @param lod - lod index -+/// @param offset - output offset. -+INLINE void ComputeLODOffset1D( -+ const SWR_FORMAT_INFO& info, -+ uint32_t baseWidth, -+ uint32_t hAlign, -+ uint32_t lod, -+ uint32_t &offset) -+{ -+ if (lod == 0) -+ { -+ offset = 0; -+ } -+ else -+ { -+ uint32_t curWidth = baseWidth; -+ // translate mip width from pixels to blocks for block compressed formats -+ // @note hAlign is already in blocks for compressed formats so no need to convert -+ if (info.isBC) curWidth /= info.bcWidth; -+ -+ offset = GFX_ALIGN(curWidth, hAlign); -+ for (uint32_t l = 1; l < lod; ++l) -+ { -+ curWidth = GFX_ALIGN(std::max(curWidth >> 1, 1U), hAlign); -+ offset += curWidth; -+ } -+ -+ if (info.isSubsampled) -+ { -+ offset /= info.bcWidth; -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes x lod offset for 2D surface at specified lod. -+/// @param baseWidth - width of basemip (mip 0). -+/// @param hAlign - horizontal alignment per mip, in texels -+/// @param lod - lod index -+/// @param offset - output offset. -+INLINE void ComputeLODOffsetX( -+ const SWR_FORMAT_INFO& info, -+ uint32_t baseWidth, -+ uint32_t hAlign, -+ uint32_t lod, -+ uint32_t &offset) -+{ -+ if (lod < 2) -+ { -+ offset = 0; -+ } -+ else -+ { -+ uint32_t curWidth = baseWidth; -+ // convert mip width from pixels to blocks for block compressed formats -+ // @note hAlign is already in blocks for compressed formats so no need to convert -+ if (info.isBC) curWidth /= info.bcWidth; -+ -+ curWidth = std::max(curWidth >> 1, 1U); -+ curWidth = GFX_ALIGN(curWidth, hAlign); -+ -+ if (info.isSubsampled) -+ { -+ curWidth /= info.bcWidth; -+ } -+ -+ offset = curWidth; -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes y lod offset for 2D surface at specified lod. -+/// @param baseHeight - height of basemip (mip 0). -+/// @param vAlign - vertical alignment per mip, in rows -+/// @param lod - lod index -+/// @param offset - output offset.
-+INLINE void ComputeLODOffsetY( -+ const SWR_FORMAT_INFO& info, -+ uint32_t baseHeight, -+ uint32_t vAlign, -+ uint32_t lod, -+ uint32_t &offset) -+{ -+ if (lod == 0) -+ { -+ offset = 0; -+ } -+ else -+ { -+ offset = 0; -+ uint32_t mipHeight = baseHeight; -+ -+ // translate mip height from pixels to blocks for block compressed formats -+ // @note VAlign is already in blocks for compressed formats so no need to convert -+ if (info.isBC) mipHeight /= info.bcHeight; -+ -+ for (uint32_t l = 1; l <= lod; ++l) -+ { -+ uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign); -+ offset += ((l != 2) ? alignedMipHeight : 0); -+ mipHeight = std::max(mipHeight >> 1, 1U); -+ } -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes 1D surface offset -+/// @param x - offset from start of array slice at given lod. -+/// @param array - array slice index -+/// @param lod - lod index -+/// @param pState - surface state -+/// @param xOffsetBytes - output offset in bytes. -+template -+INLINE void ComputeSurfaceOffset1D( -+ uint32_t x, -+ uint32_t array, -+ uint32_t lod, -+ const SWR_SURFACE_STATE *pState, -+ uint32_t &xOffsetBytes) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); -+ uint32_t lodOffset; -+ -+ if (UseCachedOffsets) -+ { -+ lodOffset = pState->lodOffsets[0][lod]; -+ } -+ else -+ { -+ ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset); -+ } -+ -+ xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Adjusts the array slice for legacy TileY MSAA -+/// @param pState - surface state -+/// @param arrayIndex - array slice index -+/// @param sampleNum - requested sample -+INLINE uint32_t AdjustArrayIndexForMSAA(const SWR_SURFACE_STATE *pState, uint32_t arrayIndex, uint32_t sampleNum) -+{ -+ uint32_t sampleSlice; -+ /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF. -+ if(pState->tileMode == SWR_TILE_MODE_YMAJOR || -+ pState->tileMode == SWR_TILE_NONE) -+ { -+ uint32_t sampleShift; -+ switch(pState->numSamples) -+ { -+ case 1: -+ assert(sampleNum == 0); -+ sampleShift = 0; -+ break; -+ case 2: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 1; -+ break; -+ case 4: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 2; -+ break; -+ case 8: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 3; -+ break; -+ case 16: -+ assert(pState->type == SURFACE_2D); -+ sampleShift = 4; -+ break; -+ default: -+ assert(0 && "Unsupported sample count"); -+ sampleShift = 0; -+ break; -+ } -+ sampleSlice = (arrayIndex << sampleShift) | sampleNum; -+ } -+ else -+ { -+ sampleSlice = arrayIndex; -+ } -+ return sampleSlice; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes 2D surface offset -+/// @param x - horizontal offset from start of array slice and lod. -+/// @param y - vertical offset from start of array slice and lod. -+/// @param array - array slice index -+/// @param sampleNum - sample index -+/// @param lod - lod index -+/// @param pState - surface state -+/// @param xOffsetBytes - output x offset in bytes. -+/// @param yOffsetRows - output y offset in rows.
-+template -+INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); -+ uint32_t lodOffsetX, lodOffsetY; -+ -+ if (UseCachedOffsets) -+ { -+ lodOffsetX = pState->lodOffsets[0][lod]; -+ lodOffsetY = pState->lodOffsets[1][lod]; -+ } -+ else -+ { -+ ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX); -+ ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY); -+ } -+ -+ uint32_t arrayIndex = AdjustArrayIndexForMSAA(pState, array, sampleNum); -+ xOffsetBytes = (x + lodOffsetX) * info.Bpp; -+ yOffsetRows = (arrayIndex * pState->qpitch) + lodOffsetY + y; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes 3D surface offset -+/// @param x - horizontal offset from start of array slice and lod. -+/// @param y - vertical offset from start of array slice and lod. -+/// @param z - depth offset from start of array slice and lod. -+/// @param lod - lod index -+/// @param pState - surface state -+/// @param xOffsetBytes - output x offset in bytes. -+/// @param yOffsetRows - output y offset in rows. -+/// @param zOffsetSlices - output z offset in slices. -+template -+INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices) -+{ -+ const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format); -+ uint32_t lodOffsetX, lodOffsetY; -+ -+ if (UseCachedOffsets) -+ { -+ lodOffsetX = pState->lodOffsets[0][lod]; -+ lodOffsetY = pState->lodOffsets[1][lod]; -+ } -+ else -+ { -+ ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX); -+ ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY); -+ } -+ -+ xOffsetBytes = (x + lodOffsetX) * info.Bpp; -+ yOffsetRows = lodOffsetY + y; -+ zOffsetSlices = z; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes -+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param pState - pointer to the surface state -+template -+INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) -+{ -+ return ComputeOffset2D(pState->pitch, xOffsetBytes, yOffsetRows); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes -+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param zOffsetSlices - z offset from base of surface in slices -+/// @param pState - pointer to the surface state -+template -+INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) -+{ -+ return ComputeOffset3D(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes
-+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param pState - pointer to the surface state -+INLINE -+uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) -+{ -+ switch (pState->tileMode) -+ { -+ case SWR_TILE_NONE: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_SWRZ: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); -+ default: SWR_ASSERT(0, "Unsupported tiling mode"); -+ } -+ return (uint32_t) NULL; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode -+/// and returns final surface address -+/// @param xOffsetBytes - x offset from base of surface in bytes -+/// @param yOffsetRows - y offset from base of surface in rows -+/// @param zOffsetSlices - z offset from base of surface in slices -+/// @param pState - pointer to the surface state -+INLINE -+uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) -+{ -+ switch (pState->tileMode) -+ { -+ case SWR_TILE_NONE: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); -+ case SWR_TILE_SWRZ: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); -+ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); -+ default: SWR_ASSERT(0, "Unsupported tiling mode"); -+ } -+ return (uint32_t) NULL; -+} -+ -+template -+INLINE -+uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) -+{ -+ uint32_t offsetX = 0, offsetY = 0, offsetZ = 0; -+ switch (pState->type) -+ { -+ case SURFACE_BUFFER: -+ case SURFACE_STRUCTURED_BUFFER: -+ offsetX = x * pState->pitch; -+ return offsetX; -+ break; -+ case SURFACE_1D: -+ ComputeSurfaceOffset1D(x, array, lod, pState, offsetX); -+ return TileSwizzle2D(offsetX, 0, pState); -+ break; -+ case SURFACE_2D: -+ ComputeSurfaceOffset2D(x, y, array, sampleNum, lod, pState, offsetX, offsetY); -+ return TileSwizzle2D(offsetX, offsetY, pState); -+ case SURFACE_3D: -+ ComputeSurfaceOffset3D(x, y, z, lod, pState, offsetX, offsetY, offsetZ); -+ return TileSwizzle3D(offsetX, offsetY, offsetZ, pState); -+ break; -+ case SURFACE_CUBE: -+ ComputeSurfaceOffset2D(x, y, array, sampleNum, lod, pState, offsetX, offsetY); -+ return TileSwizzle2D(offsetX, offsetY, pState); -+ break; -+ default: SWR_ASSERT(0, "Unsupported format"); -+ } -+ -+ return (uint32_t) NULL; -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes surface address at the given location and lod -+/// @param x - x location in pixels -+/// @param y - y location in rows -+/// @param z - z location for 3D surfaces -+/// @param array - array slice for 1D and 2D surfaces -+/// @param lod - level of detail -+/// @param pState - pointer to the surface state -+template -+INLINE -+void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) -+{ -+ return pState->pBaseAddress 
+ ComputeSurfaceOffset(x, y, z, array, sampleNum, lod, pState); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h -new file mode 100644 -index 0000000..9dd4cd2 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h -@@ -0,0 +1,239 @@ -+/**************************************************************************** -+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -+* -+* Permission is hereby granted, free of charge, to any person obtaining a -+* copy of this software and associated documentation files (the "Software"), -+* to deal in the Software without restriction, including without limitation -+* the rights to use, copy, modify, merge, publish, distribute, sublicense, -+* and/or sell copies of the Software, and to permit persons to whom the -+* Software is furnished to do so, subject to the following conditions: -+* -+* The above copyright notice and this permission notice (including the next -+* paragraph) shall be included in all copies or substantial portions of the -+* Software. -+* -+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+* IN THE SOFTWARE. -+* -+* @file tilingtraits.h -+* -+* @brief Tiling traits. -+* -+******************************************************************************/ -+#pragma once -+ -+#include "core/state.h" -+ -+template -+struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ mode }; -+ static UINT GetCu() { SWR_ASSERT(0); return 0; } -+ static UINT GetCv() { SWR_ASSERT(0); return 0; } -+ static UINT GetCr() { SWR_ASSERT(0); return 0; } -+ static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; } -+ -+ /// @todo correct pdep shifts for all rastertile dims. Unused for now -+ static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } -+ static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } -+}; -+ -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE }; -+ static UINT GetCu() { return 0; } -+ static UINT GetCv() { return 0; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 0; } -+ static UINT GetPdepX() { return 0x00; } -+ static UINT GetPdepY() { return 0x00; } -+}; -+ -+template<> struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; -+ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; } -+ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; } -+ -+ /// @todo correct pdep shifts for all rastertile dims. 
Unused for now -+ static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; } -+ static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; } -+}; -+ -+template<> struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; -+ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; } -+ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; } -+ -+ static UINT GetPdepX() { return 0x37; } -+ static UINT GetPdepY() { return 0xC8; } -+}; -+ -+template<> struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; -+ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; } -+ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; } -+ -+ /// @todo correct pdep shifts for all rastertile dims. Unused for now -+ static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } -+ static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } -+}; -+ -+// y-major tiling layout unaffected by element size -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR }; -+ static UINT GetCu() { return 7; } -+ static UINT GetCv() { return 5; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 12; } -+ -+ static UINT GetPdepX() { return 0xe0f; } -+ static UINT GetPdepY() { return 0x1f0; } -+}; -+ -+// x-major tiling layout unaffected by element size -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR }; -+ static UINT GetCu() { return 9; } -+ static UINT GetCv() { return 3; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 12; } -+ -+ static UINT GetPdepX() { return 0x1ff; } -+ static UINT GetPdepY() { return 0xe00; } -+}; -+ -+template struct TilingTraits -+{ -+ static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR }; -+ static UINT GetCu() { return 6; } -+ static UINT GetCv() { return 6; } -+ static UINT GetCr() { return 0; } -+ static UINT GetTileIDShift() { return 12; } -+ -+ static UINT GetPdepX() { return 0xe15; } -+ static UINT GetPdepY() { return 0x1ea; } -+}; -+ -+INLINE -+UINT pdep_u32(UINT a, UINT mask) -+{ -+#if KNOB_ARCH==KNOB_ARCH_AVX2 -+ return _pdep_u32(a, mask); -+#else -+ UINT result = 0; -+ -+ // copied from http://wm.ite.pl/articles/pdep-soft-emu.html -+ // using bsf instead of funky loop -+ DWORD maskIndex; -+ while (_BitScanForward(&maskIndex, mask)) -+ { -+ // 1. isolate lowest set bit of mask -+ const UINT lowest = 1 << maskIndex; -+ -+ // 2. populate LSB from src -+ const UINT LSB = (UINT)((int)(a << 31) >> 31); -+ -+ // 3. copy bit from mask -+ result |= LSB & lowest; -+ -+ // 4. clear lowest bit -+ mask &= ~lowest; -+ -+ // 5. 
-+ prepare for next iteration -+ a >>= 1; -+ } -+ -+ return result; -+#endif -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the tileID for 2D tiled surfaces -+/// @param pitch - surface pitch in bytes -+/// @param tileX - x offset in tiles -+/// @param tileY - y offset in tiles -+template -+INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY) -+{ -+ UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX; -+ return tileID << TTraits::GetTileIDShift(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the tileID for 3D tiled surfaces -+/// @param qpitch - surface qpitch in rows -+/// @param pitch - surface pitch in bytes -+/// @param tileX - x offset in tiles -+/// @param tileY - y offset in tiles -+/// @param tileZ - z offset in tiles -+template -+INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ) -+{ -+ UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX; -+ return tileID << TTraits::GetTileIDShift(); -+} -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the byte offset for 2D tiled surfaces -+/// @param pitch - surface pitch in bytes -+/// @param x - x offset in bytes -+/// @param y - y offset in rows -+template -+INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y) -+{ -+ UINT tileID = ComputeTileOffset2D(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv()); -+ UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX()); -+ UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY()); -+ return (tileID | xSwizzle | ySwizzle); -+} -+ -+#if KNOB_ARCH <= KNOB_ARCH_AVX -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the byte offset for 2D tiled surfaces. Specialization -+/// for tile-y surfaces that uses bit twiddling instead of pdep emulation. -+/// @param pitch - surface pitch in bytes -+/// @param x - x offset in bytes -+/// @param y - y offset in rows -+template<> -+INLINE UINT ComputeOffset2D >(UINT pitch, UINT x, UINT y) -+{ -+ typedef TilingTraits TTraits; -+ -+ UINT tileID = ComputeTileOffset2D(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv()); -+ UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf); -+ UINT ySwizzle = (y << 4) & 0x1f0; -+ return (tileID | xSwizzle | ySwizzle); -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////// -+/// @brief Computes the byte offset for 3D tiled surfaces -+/// @param qpitch - depth pitch in rows -+/// @param pitch - surface pitch in bytes -+/// @param x - x offset in bytes -+/// @param y - y offset in rows -+/// @param z - z offset in slices -+template -+INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z) -+{ -+ UINT tileID = ComputeTileOffset3D(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr()); -+ UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX()); -+ UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY()); -+ return (tileID | xSwizzle | ySwizzle); -+} -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py -new file mode 100644 -index 0000000..a6aa81b ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py -@@ -0,0 +1,79 @@ -+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+# Python source -+from __future__ import print_function -+import os -+import sys -+import knob_defs -+from mako.template import Template -+from mako.exceptions import RichTraceback -+ -+def write_template_to_string(template_filename, **kwargs): -+ try: -+ template = Template(filename=template_filename) -+ # Split + Join fixes line-endings for whatever platform you are using -+ return '\n'.join(template.render(**kwargs).splitlines()) -+ except: -+ traceback = RichTraceback() -+ for (filename, lineno, function, line) in traceback.traceback: -+ print("File %s, line %s, in %s" % (filename, lineno, function)) -+ print(line, "\n") -+ print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) -+ -+def write_template_to_file(template_filename, output_filename, **kwargs): -+ with open(output_filename, "w") as outfile: -+ print(write_template_to_string(template_filename, **kwargs), file=outfile) -+ -+def main(args=sys.argv[1:]): -+ if len(args) != 1: -+ print('Usage:', sys.argv[0], '', file=sys.stderr) -+ return 1 -+ -+ output_dir = args[0] -+ if not os.path.isdir(output_dir): -+ if os.path.exists(output_dir): -+ print('ERROR: Invalid output directory:', output_dir, file=sys.stderr) -+ return 1 -+ -+ try: -+ os.makedirs(output_dir) -+ except: -+ print('ERROR: Could not create output directory:', output_dir, file=sys.stderr) -+ return 1 -+ -+ # Output path exists, now just run the template -+ template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template']) -+ output_file = os.sep.join([output_dir, 'gen_knobs.cpp']) -+ output_header = os.sep.join([output_dir, 'gen_knobs.h']) -+ -+ for f in [output_header, output_file]: -+ write_template_to_file(template_file, f, -+ filename='gen_knobs', -+ knobs=knob_defs.KNOBS, -+ includes=['core/knobs_init.h'], -+ gen_header=True if f == output_header else False) -+ -+ return 0 -+ -+if __name__ == '__main__': -+ sys.exit(main()) -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py -new file mode 100644 -index 0000000..0a64953 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py -@@ -0,0 +1,212 @@ -+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
-+# -+# Permission is hereby granted, free of charge, to any person obtaining a -+# copy of this software and associated documentation files (the "Software"), -+# to deal in the Software without restriction, including without limitation -+# the rights to use, copy, modify, merge, publish, distribute, sublicense, -+# and/or sell copies of the Software, and to permit persons to whom the -+# Software is furnished to do so, subject to the following conditions: -+# -+# The above copyright notice and this permission notice (including the next -+# paragraph) shall be included in all copies or substantial portions of the -+# Software. -+# -+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+# IN THE SOFTWARE. -+ -+# Python source -+KNOBS = [ -+ ['ENABLE_ASSERT_DIALOGS', { -+ 'type' : 'bool', -+ 'default' : 'true', -+ 'desc' : ['Use dialogs when asserts fire.', -+ 'Asserts are only enabled in debug builds'], -+ }], -+ -+ ['USE_GENERIC_STORETILE', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Always use generic function for performing StoreTile.', -+ 'Will be slightly slower than using optimized (jitted) path'], -+ }], -+ -+ ['SINGLE_THREADED', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['If enabled will perform all rendering on the API thread.', -+ 'This is useful mainly for debugging purposes.'], -+ }], -+ -+ ['FAST_CLEAR', { -+ 'type' : 'bool', -+ 'default' : 'true', -+ 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', -+ 'defer clear execution to first backend op on hottile, or hottile store'], -+ }], -+ -+ ['MAX_NUMA_NODES', { -+ 'type' : 'uint32_t', -+ 'default' : '0', -+ 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', -+ ' 0 == ALL NUMA-nodes in the system', -+ ' N == Use at most N NUMA-nodes for rendering'], -+ }], -+ -+ ['MAX_CORES_PER_NUMA_NODE', { -+ 'type' : 'uint32_t', -+ 'default' : '0', -+ 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', -+ ' 0 == ALL non-API thread cores per NUMA-node', -+ ' N == Use at most N cores per NUMA-node'], -+ }], -+ -+ ['MAX_THREADS_PER_CORE', { -+ 'type' : 'uint32_t', -+ 'default' : '1', -+ 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', -+ ' 0 == ALL hyper-threads per core', -+ ' N == Use at most N hyper-threads per physical core'], -+ }], -+ -+ ['BUCKETS_START_FRAME', { -+ 'type' : 'uint32_t', -+ 'default' : '1200', -+ 'desc' : ['Frame from when to start saving buckets data.', -+ '', -+ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', -+ 'for this to have an effect.'], -+ }], -+ -+ ['BUCKETS_END_FRAME', { -+ 'type' : 'uint32_t', -+ 'default' : '1400', -+ 'desc' : ['Frame at which to stop saving buckets data.', -+ '', -+ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', -+ 'for this to have an effect.'], -+ }], -+ -+ ['TOSS_DRAW', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Disable per-draw execution', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_QUEUE_FE', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop 
per-draw execution at worker FE', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_FETCH', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at vertex fetch', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_IA', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at input assembler', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_VS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at vertex shader', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_SETUP_TRIS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at primitive setup', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_BIN_TRIS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at primitive binning', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['TOSS_RS', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Stop per-draw execution at rasterizer', -+ '', -+ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], -+ }], -+ -+ ['WORKER_SPIN_LOOP_COUNT', { -+ 'type' : 'uint32_t', -+ 'default' : '5000', -+ 'desc' : ['Number of spin-loop iterations worker threads will perform', -+ 'before going to sleep when waiting for work'], -+ }], -+ -+ ['MAX_DRAWS_IN_FLIGHT', { -+ 'type' : 'uint32_t', -+ 'default' : '160', -+ 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], -+ }], -+ -+ ['MAX_PRIMS_PER_DRAW', { -+ 'type' : 'uint32_t', -+ 'default' : '2040', -+ 'desc' : ['Maximum primitives in a single Draw().', -+ 'Larger primitives are split into smaller Draw calls.', -+ 'Should be a multiple of (3 * vectorWidth).'], -+ }], -+ -+ ['MAX_TESS_PRIMS_PER_DRAW', { -+ 'type' : 'uint32_t', -+ 'default' : '16', -+ 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', -+ 'Larger primitives are split into smaller Draw calls.', -+ 'Should be a multiple of (vectorWidth).'], -+ }], -+ -+ ['MAX_FRAC_ODD_TESS_FACTOR', { -+ 'type' : 'float', -+ 'default' : '63.0f', -+ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'], -+ }], -+ -+ ['MAX_FRAC_EVEN_TESS_FACTOR', { -+ 'type' : 'float', -+ 'default' : '64.0f', -+ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'], -+ }], -+ -+ ['MAX_INTEGER_TESS_FACTOR', { -+ 'type' : 'uint32_t', -+ 'default' : '64', -+ 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'], -+ }], -+ -+ ['DUMP_SHADER_IR', { -+ 'type' : 'bool', -+ 'default' : 'false', -+ 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], -+ }], -+ -+ -+] -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py -new file mode 100644 -index 0000000..d963848 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py -@@ -0,0 +1,8 @@ -+# mako/__init__.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+ -+__version__ = '1.0.1' -diff --git 
a/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py -new file mode 100644 -index 0000000..efbc4fc ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py -@@ -0,0 +1,845 @@ -+# mako/_ast_util.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+""" -+ ast -+ ~~~ -+ -+ The `ast` module helps Python applications to process trees of the Python -+ abstract syntax grammar. The abstract syntax itself might change with -+ each Python release; this module helps to find out programmatically what -+ the current grammar looks like and allows modifications of it. -+ -+ An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as -+ a flag to the `compile()` builtin function or by using the `parse()` -+ function from this module. The result will be a tree of objects whose -+ classes all inherit from `ast.AST`. -+ -+ A modified abstract syntax tree can be compiled into a Python code object -+ using the built-in `compile()` function. -+ -+ Additionally various helper functions are provided that make working with -+ the trees simpler. The main intention of the helper functions and this -+ module in general is to provide an easy to use interface for libraries -+ that work tightly with the python syntax (template engines for example). -+ -+ -+ :copyright: Copyright 2008 by Armin Ronacher. -+ :license: Python License. -+""" -+from _ast import * -+from mako.compat import arg_stringname -+ -+BOOLOP_SYMBOLS = { -+ And: 'and', -+ Or: 'or' -+} -+ -+BINOP_SYMBOLS = { -+ Add: '+', -+ Sub: '-', -+ Mult: '*', -+ Div: '/', -+ FloorDiv: '//', -+ Mod: '%', -+ LShift: '<<', -+ RShift: '>>', -+ BitOr: '|', -+ BitAnd: '&', -+ BitXor: '^' -+} -+ -+CMPOP_SYMBOLS = { -+ Eq: '==', -+ Gt: '>', -+ GtE: '>=', -+ In: 'in', -+ Is: 'is', -+ IsNot: 'is not', -+ Lt: '<', -+ LtE: '<=', -+ NotEq: '!=', -+ NotIn: 'not in' -+} -+ -+UNARYOP_SYMBOLS = { -+ Invert: '~', -+ Not: 'not', -+ UAdd: '+', -+ USub: '-' -+} -+ -+ALL_SYMBOLS = {} -+ALL_SYMBOLS.update(BOOLOP_SYMBOLS) -+ALL_SYMBOLS.update(BINOP_SYMBOLS) -+ALL_SYMBOLS.update(CMPOP_SYMBOLS) -+ALL_SYMBOLS.update(UNARYOP_SYMBOLS) -+ -+ -+def parse(expr, filename='', mode='exec'): -+ """Parse an expression into an AST node.""" -+ return compile(expr, filename, mode, PyCF_ONLY_AST) -+ -+ -+def to_source(node, indent_with=' ' * 4): -+ """ -+ This function can convert a node tree back into python sourcecode. This -+ is useful for debugging purposes, especially if you're dealing with custom -+ asts not generated by python itself. -+ -+ It could be that the sourcecode is evaluable when the AST itself is not -+ compilable / evaluable. The reason for this is that the AST contains some -+ more data than regular sourcecode does, which is dropped during -+ conversion. -+ -+ Each level of indentation is replaced with `indent_with`. Per default this -+ parameter is equal to four spaces as suggested by PEP 8, but it might be -+ adjusted to match the application's styleguide. -+ """ -+ generator = SourceGenerator(indent_with) -+ generator.visit(node) -+ return ''.join(generator.result) -+ -+ -+def dump(node): -+ """ -+ A very verbose representation of the node passed. This is useful for -+ debugging purposes. 
-+ """ -+ def _format(node): -+ if isinstance(node, AST): -+ return '%s(%s)' % (node.__class__.__name__, -+ ', '.join('%s=%s' % (a, _format(b)) -+ for a, b in iter_fields(node))) -+ elif isinstance(node, list): -+ return '[%s]' % ', '.join(_format(x) for x in node) -+ return repr(node) -+ if not isinstance(node, AST): -+ raise TypeError('expected AST, got %r' % node.__class__.__name__) -+ return _format(node) -+ -+ -+def copy_location(new_node, old_node): -+ """ -+ Copy the source location hint (`lineno` and `col_offset`) from the -+ old to the new node if possible and return the new one. -+ """ -+ for attr in 'lineno', 'col_offset': -+ if attr in old_node._attributes and attr in new_node._attributes \ -+ and hasattr(old_node, attr): -+ setattr(new_node, attr, getattr(old_node, attr)) -+ return new_node -+ -+ -+def fix_missing_locations(node): -+ """ -+ Some nodes require a line number and the column offset. Without that -+ information the compiler will abort the compilation. Because it can be -+ a dull task to add appropriate line numbers and column offsets when -+ adding new nodes this function can help. It copies the line number and -+ column offset of the parent node to the child nodes without this -+ information. -+ -+ Unlike `copy_location` this works recursive and won't touch nodes that -+ already have a location information. -+ """ -+ def _fix(node, lineno, col_offset): -+ if 'lineno' in node._attributes: -+ if not hasattr(node, 'lineno'): -+ node.lineno = lineno -+ else: -+ lineno = node.lineno -+ if 'col_offset' in node._attributes: -+ if not hasattr(node, 'col_offset'): -+ node.col_offset = col_offset -+ else: -+ col_offset = node.col_offset -+ for child in iter_child_nodes(node): -+ _fix(child, lineno, col_offset) -+ _fix(node, 1, 0) -+ return node -+ -+ -+def increment_lineno(node, n=1): -+ """ -+ Increment the line numbers of all nodes by `n` if they have line number -+ attributes. This is useful to "move code" to a different location in a -+ file. -+ """ -+ for node in zip((node,), walk(node)): -+ if 'lineno' in node._attributes: -+ node.lineno = getattr(node, 'lineno', 0) + n -+ -+ -+def iter_fields(node): -+ """Iterate over all fields of a node, only yielding existing fields.""" -+ # CPython 2.5 compat -+ if not hasattr(node, '_fields') or not node._fields: -+ return -+ for field in node._fields: -+ try: -+ yield field, getattr(node, field) -+ except AttributeError: -+ pass -+ -+ -+def get_fields(node): -+ """Like `iter_fiels` but returns a dict.""" -+ return dict(iter_fields(node)) -+ -+ -+def iter_child_nodes(node): -+ """Iterate over all child nodes or a node.""" -+ for name, field in iter_fields(node): -+ if isinstance(field, AST): -+ yield field -+ elif isinstance(field, list): -+ for item in field: -+ if isinstance(item, AST): -+ yield item -+ -+ -+def get_child_nodes(node): -+ """Like `iter_child_nodes` but returns a list.""" -+ return list(iter_child_nodes(node)) -+ -+ -+def get_compile_mode(node): -+ """ -+ Get the mode for `compile` of a given node. If the node is not a `mod` -+ node (`Expression`, `Module` etc.) a `TypeError` is thrown. -+ """ -+ if not isinstance(node, mod): -+ raise TypeError('expected mod node, got %r' % node.__class__.__name__) -+ return { -+ Expression: 'eval', -+ Interactive: 'single' -+ }.get(node.__class__, 'expr') -+ -+ -+def get_docstring(node): -+ """ -+ Return the docstring for the given node or `None` if no docstring can be -+ found. If the node provided does not accept docstrings a `TypeError` -+ will be raised. 
-+ """ -+ if not isinstance(node, (FunctionDef, ClassDef, Module)): -+ raise TypeError("%r can't have docstrings" % node.__class__.__name__) -+ if node.body and isinstance(node.body[0], Str): -+ return node.body[0].s -+ -+ -+def walk(node): -+ """ -+ Iterate over all nodes. This is useful if you only want to modify nodes in -+ place and don't care about the context or the order the nodes are returned. -+ """ -+ from collections import deque -+ todo = deque([node]) -+ while todo: -+ node = todo.popleft() -+ todo.extend(iter_child_nodes(node)) -+ yield node -+ -+ -+class NodeVisitor(object): -+ """ -+ Walks the abstract syntax tree and call visitor functions for every node -+ found. The visitor functions may return values which will be forwarded -+ by the `visit` method. -+ -+ Per default the visitor functions for the nodes are ``'visit_'`` + -+ class name of the node. So a `TryFinally` node visit function would -+ be `visit_TryFinally`. This behavior can be changed by overriding -+ the `get_visitor` function. If no visitor function exists for a node -+ (return value `None`) the `generic_visit` visitor is used instead. -+ -+ Don't use the `NodeVisitor` if you want to apply changes to nodes during -+ traversing. For this a special visitor exists (`NodeTransformer`) that -+ allows modifications. -+ """ -+ -+ def get_visitor(self, node): -+ """ -+ Return the visitor function for this node or `None` if no visitor -+ exists for this node. In that case the generic visit function is -+ used instead. -+ """ -+ method = 'visit_' + node.__class__.__name__ -+ return getattr(self, method, None) -+ -+ def visit(self, node): -+ """Visit a node.""" -+ f = self.get_visitor(node) -+ if f is not None: -+ return f(node) -+ return self.generic_visit(node) -+ -+ def generic_visit(self, node): -+ """Called if no explicit visitor function exists for a node.""" -+ for field, value in iter_fields(node): -+ if isinstance(value, list): -+ for item in value: -+ if isinstance(item, AST): -+ self.visit(item) -+ elif isinstance(value, AST): -+ self.visit(value) -+ -+ -+class NodeTransformer(NodeVisitor): -+ """ -+ Walks the abstract syntax tree and allows modifications of nodes. -+ -+ The `NodeTransformer` will walk the AST and use the return value of the -+ visitor functions to replace or remove the old node. If the return -+ value of the visitor function is `None` the node will be removed -+ from the previous location otherwise it's replaced with the return -+ value. The return value may be the original node in which case no -+ replacement takes place. -+ -+ Here an example transformer that rewrites all `foo` to `data['foo']`:: -+ -+ class RewriteName(NodeTransformer): -+ -+ def visit_Name(self, node): -+ return copy_location(Subscript( -+ value=Name(id='data', ctx=Load()), -+ slice=Index(value=Str(s=node.id)), -+ ctx=node.ctx -+ ), node) -+ -+ Keep in mind that if the node you're operating on has child nodes -+ you must either transform the child nodes yourself or call the generic -+ visit function for the node first. -+ -+ Nodes that were part of a collection of statements (that applies to -+ all statement nodes) may also return a list of nodes rather than just -+ a single node. 
-+ -+ Usually you use the transformer like this:: -+ -+ node = YourTransformer().visit(node) -+ """ -+ -+ def generic_visit(self, node): -+ for field, old_value in iter_fields(node): -+ old_value = getattr(node, field, None) -+ if isinstance(old_value, list): -+ new_values = [] -+ for value in old_value: -+ if isinstance(value, AST): -+ value = self.visit(value) -+ if value is None: -+ continue -+ elif not isinstance(value, AST): -+ new_values.extend(value) -+ continue -+ new_values.append(value) -+ old_value[:] = new_values -+ elif isinstance(old_value, AST): -+ new_node = self.visit(old_value) -+ if new_node is None: -+ delattr(node, field) -+ else: -+ setattr(node, field, new_node) -+ return node -+ -+ -+class SourceGenerator(NodeVisitor): -+ """ -+ This visitor is able to transform a well formed syntax tree into python -+ sourcecode. For more details have a look at the docstring of the -+ `node_to_source` function. -+ """ -+ -+ def __init__(self, indent_with): -+ self.result = [] -+ self.indent_with = indent_with -+ self.indentation = 0 -+ self.new_lines = 0 -+ -+ def write(self, x): -+ if self.new_lines: -+ if self.result: -+ self.result.append('\n' * self.new_lines) -+ self.result.append(self.indent_with * self.indentation) -+ self.new_lines = 0 -+ self.result.append(x) -+ -+ def newline(self, n=1): -+ self.new_lines = max(self.new_lines, n) -+ -+ def body(self, statements): -+ self.new_line = True -+ self.indentation += 1 -+ for stmt in statements: -+ self.visit(stmt) -+ self.indentation -= 1 -+ -+ def body_or_else(self, node): -+ self.body(node.body) -+ if node.orelse: -+ self.newline() -+ self.write('else:') -+ self.body(node.orelse) -+ -+ def signature(self, node): -+ want_comma = [] -+ def write_comma(): -+ if want_comma: -+ self.write(', ') -+ else: -+ want_comma.append(True) -+ -+ padding = [None] * (len(node.args) - len(node.defaults)) -+ for arg, default in zip(node.args, padding + node.defaults): -+ write_comma() -+ self.visit(arg) -+ if default is not None: -+ self.write('=') -+ self.visit(default) -+ if node.vararg is not None: -+ write_comma() -+ self.write('*' + arg_stringname(node.vararg)) -+ if node.kwarg is not None: -+ write_comma() -+ self.write('**' + arg_stringname(node.kwarg)) -+ -+ def decorators(self, node): -+ for decorator in node.decorator_list: -+ self.newline() -+ self.write('@') -+ self.visit(decorator) -+ -+ # Statements -+ -+ def visit_Assign(self, node): -+ self.newline() -+ for idx, target in enumerate(node.targets): -+ if idx: -+ self.write(', ') -+ self.visit(target) -+ self.write(' = ') -+ self.visit(node.value) -+ -+ def visit_AugAssign(self, node): -+ self.newline() -+ self.visit(node.target) -+ self.write(BINOP_SYMBOLS[type(node.op)] + '=') -+ self.visit(node.value) -+ -+ def visit_ImportFrom(self, node): -+ self.newline() -+ self.write('from %s%s import ' % ('.' 
* node.level, node.module)) -+ for idx, item in enumerate(node.names): -+ if idx: -+ self.write(', ') -+ self.write(item) -+ -+ def visit_Import(self, node): -+ self.newline() -+ for item in node.names: -+ self.write('import ') -+ self.visit(item) -+ -+ def visit_Expr(self, node): -+ self.newline() -+ self.generic_visit(node) -+ -+ def visit_FunctionDef(self, node): -+ self.newline(n=2) -+ self.decorators(node) -+ self.newline() -+ self.write('def %s(' % node.name) -+ self.signature(node.args) -+ self.write('):') -+ self.body(node.body) -+ -+ def visit_ClassDef(self, node): -+ have_args = [] -+ def paren_or_comma(): -+ if have_args: -+ self.write(', ') -+ else: -+ have_args.append(True) -+ self.write('(') -+ -+ self.newline(n=3) -+ self.decorators(node) -+ self.newline() -+ self.write('class %s' % node.name) -+ for base in node.bases: -+ paren_or_comma() -+ self.visit(base) -+ # XXX: the if here is used to keep this module compatible -+ # with python 2.6. -+ if hasattr(node, 'keywords'): -+ for keyword in node.keywords: -+ paren_or_comma() -+ self.write(keyword.arg + '=') -+ self.visit(keyword.value) -+ if node.starargs is not None: -+ paren_or_comma() -+ self.write('*') -+ self.visit(node.starargs) -+ if node.kwargs is not None: -+ paren_or_comma() -+ self.write('**') -+ self.visit(node.kwargs) -+ self.write(have_args and '):' or ':') -+ self.body(node.body) -+ -+ def visit_If(self, node): -+ self.newline() -+ self.write('if ') -+ self.visit(node.test) -+ self.write(':') -+ self.body(node.body) -+ while True: -+ else_ = node.orelse -+ if len(else_) == 1 and isinstance(else_[0], If): -+ node = else_[0] -+ self.newline() -+ self.write('elif ') -+ self.visit(node.test) -+ self.write(':') -+ self.body(node.body) -+ else: -+ self.newline() -+ self.write('else:') -+ self.body(else_) -+ break -+ -+ def visit_For(self, node): -+ self.newline() -+ self.write('for ') -+ self.visit(node.target) -+ self.write(' in ') -+ self.visit(node.iter) -+ self.write(':') -+ self.body_or_else(node) -+ -+ def visit_While(self, node): -+ self.newline() -+ self.write('while ') -+ self.visit(node.test) -+ self.write(':') -+ self.body_or_else(node) -+ -+ def visit_With(self, node): -+ self.newline() -+ self.write('with ') -+ self.visit(node.context_expr) -+ if node.optional_vars is not None: -+ self.write(' as ') -+ self.visit(node.optional_vars) -+ self.write(':') -+ self.body(node.body) -+ -+ def visit_Pass(self, node): -+ self.newline() -+ self.write('pass') -+ -+ def visit_Print(self, node): -+ # XXX: python 2.6 only -+ self.newline() -+ self.write('print ') -+ want_comma = False -+ if node.dest is not None: -+ self.write(' >> ') -+ self.visit(node.dest) -+ want_comma = True -+ for value in node.values: -+ if want_comma: -+ self.write(', ') -+ self.visit(value) -+ want_comma = True -+ if not node.nl: -+ self.write(',') -+ -+ def visit_Delete(self, node): -+ self.newline() -+ self.write('del ') -+ for idx, target in enumerate(node): -+ if idx: -+ self.write(', ') -+ self.visit(target) -+ -+ def visit_TryExcept(self, node): -+ self.newline() -+ self.write('try:') -+ self.body(node.body) -+ for handler in node.handlers: -+ self.visit(handler) -+ -+ def visit_TryFinally(self, node): -+ self.newline() -+ self.write('try:') -+ self.body(node.body) -+ self.newline() -+ self.write('finally:') -+ self.body(node.finalbody) -+ -+ def visit_Global(self, node): -+ self.newline() -+ self.write('global ' + ', '.join(node.names)) -+ -+ def visit_Nonlocal(self, node): -+ self.newline() -+ self.write('nonlocal ' + ', 
'.join(node.names)) -+ -+ def visit_Return(self, node): -+ self.newline() -+ self.write('return ') -+ self.visit(node.value) -+ -+ def visit_Break(self, node): -+ self.newline() -+ self.write('break') -+ -+ def visit_Continue(self, node): -+ self.newline() -+ self.write('continue') -+ -+ def visit_Raise(self, node): -+ # XXX: Python 2.6 / 3.0 compatibility -+ self.newline() -+ self.write('raise') -+ if hasattr(node, 'exc') and node.exc is not None: -+ self.write(' ') -+ self.visit(node.exc) -+ if node.cause is not None: -+ self.write(' from ') -+ self.visit(node.cause) -+ elif hasattr(node, 'type') and node.type is not None: -+ self.visit(node.type) -+ if node.inst is not None: -+ self.write(', ') -+ self.visit(node.inst) -+ if node.tback is not None: -+ self.write(', ') -+ self.visit(node.tback) -+ -+ # Expressions -+ -+ def visit_Attribute(self, node): -+ self.visit(node.value) -+ self.write('.' + node.attr) -+ -+ def visit_Call(self, node): -+ want_comma = [] -+ def write_comma(): -+ if want_comma: -+ self.write(', ') -+ else: -+ want_comma.append(True) -+ -+ self.visit(node.func) -+ self.write('(') -+ for arg in node.args: -+ write_comma() -+ self.visit(arg) -+ for keyword in node.keywords: -+ write_comma() -+ self.write(keyword.arg + '=') -+ self.visit(keyword.value) -+ if node.starargs is not None: -+ write_comma() -+ self.write('*') -+ self.visit(node.starargs) -+ if node.kwargs is not None: -+ write_comma() -+ self.write('**') -+ self.visit(node.kwargs) -+ self.write(')') -+ -+ def visit_Name(self, node): -+ self.write(node.id) -+ -+ def visit_NameConstant(self, node): -+ self.write(str(node.value)) -+ -+ def visit_arg(self, node): -+ self.write(node.arg) -+ -+ def visit_Str(self, node): -+ self.write(repr(node.s)) -+ -+ def visit_Bytes(self, node): -+ self.write(repr(node.s)) -+ -+ def visit_Num(self, node): -+ self.write(repr(node.n)) -+ -+ def visit_Tuple(self, node): -+ self.write('(') -+ idx = -1 -+ for idx, item in enumerate(node.elts): -+ if idx: -+ self.write(', ') -+ self.visit(item) -+ self.write(idx and ')' or ',)') -+ -+ def sequence_visit(left, right): -+ def visit(self, node): -+ self.write(left) -+ for idx, item in enumerate(node.elts): -+ if idx: -+ self.write(', ') -+ self.visit(item) -+ self.write(right) -+ return visit -+ -+ visit_List = sequence_visit('[', ']') -+ visit_Set = sequence_visit('{', '}') -+ del sequence_visit -+ -+ def visit_Dict(self, node): -+ self.write('{') -+ for idx, (key, value) in enumerate(zip(node.keys, node.values)): -+ if idx: -+ self.write(', ') -+ self.visit(key) -+ self.write(': ') -+ self.visit(value) -+ self.write('}') -+ -+ def visit_BinOp(self, node): -+ self.write('(') -+ self.visit(node.left) -+ self.write(' %s ' % BINOP_SYMBOLS[type(node.op)]) -+ self.visit(node.right) -+ self.write(')') -+ -+ def visit_BoolOp(self, node): -+ self.write('(') -+ for idx, value in enumerate(node.values): -+ if idx: -+ self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)]) -+ self.visit(value) -+ self.write(')') -+ -+ def visit_Compare(self, node): -+ self.write('(') -+ self.visit(node.left) -+ for op, right in zip(node.ops, node.comparators): -+ self.write(' %s ' % CMPOP_SYMBOLS[type(op)]) -+ self.visit(right) -+ self.write(')') -+ -+ def visit_UnaryOp(self, node): -+ self.write('(') -+ op = UNARYOP_SYMBOLS[type(node.op)] -+ self.write(op) -+ if op == 'not': -+ self.write(' ') -+ self.visit(node.operand) -+ self.write(')') -+ -+ def visit_Subscript(self, node): -+ self.visit(node.value) -+ self.write('[') -+ self.visit(node.slice) -+ self.write(']') 
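A usage sketch (illustrative only, not taken from this patch): the parse() and to_source() helpers defined earlier in this vendored _ast_util.py can round-trip a small statement through an AST and back out through the SourceGenerator visitors above. The sys.path entry below assumes the example is run from the top of the source tree, since the vendored mako package lives under src/gallium/drivers/swr/rasterizer/scripts/ in this patch.

    import sys
    # Make the vendored mako package importable (path as added by this patch).
    sys.path.insert(0, "src/gallium/drivers/swr/rasterizer/scripts")
    from mako._ast_util import parse, to_source

    tree = parse("flag = not done")   # parse() wraps compile(..., PyCF_ONLY_AST)
    print(to_source(tree))            # prints: flag = (not done)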
-+ -+ def visit_Slice(self, node): -+ if node.lower is not None: -+ self.visit(node.lower) -+ self.write(':') -+ if node.upper is not None: -+ self.visit(node.upper) -+ if node.step is not None: -+ self.write(':') -+ if not (isinstance(node.step, Name) and node.step.id == 'None'): -+ self.visit(node.step) -+ -+ def visit_ExtSlice(self, node): -+ for idx, item in node.dims: -+ if idx: -+ self.write(', ') -+ self.visit(item) -+ -+ def visit_Yield(self, node): -+ self.write('yield ') -+ self.visit(node.value) -+ -+ def visit_Lambda(self, node): -+ self.write('lambda ') -+ self.signature(node.args) -+ self.write(': ') -+ self.visit(node.body) -+ -+ def visit_Ellipsis(self, node): -+ self.write('Ellipsis') -+ -+ def generator_visit(left, right): -+ def visit(self, node): -+ self.write(left) -+ self.visit(node.elt) -+ for comprehension in node.generators: -+ self.visit(comprehension) -+ self.write(right) -+ return visit -+ -+ visit_ListComp = generator_visit('[', ']') -+ visit_GeneratorExp = generator_visit('(', ')') -+ visit_SetComp = generator_visit('{', '}') -+ del generator_visit -+ -+ def visit_DictComp(self, node): -+ self.write('{') -+ self.visit(node.key) -+ self.write(': ') -+ self.visit(node.value) -+ for comprehension in node.generators: -+ self.visit(comprehension) -+ self.write('}') -+ -+ def visit_IfExp(self, node): -+ self.visit(node.body) -+ self.write(' if ') -+ self.visit(node.test) -+ self.write(' else ') -+ self.visit(node.orelse) -+ -+ def visit_Starred(self, node): -+ self.write('*') -+ self.visit(node.value) -+ -+ def visit_Repr(self, node): -+ # XXX: python 2.6 only -+ self.write('`') -+ self.visit(node.value) -+ self.write('`') -+ -+ # Helper Nodes -+ -+ def visit_alias(self, node): -+ self.write(node.name) -+ if node.asname is not None: -+ self.write(' as ' + node.asname) -+ -+ def visit_comprehension(self, node): -+ self.write(' for ') -+ self.visit(node.target) -+ self.write(' in ') -+ self.visit(node.iter) -+ if node.ifs: -+ for if_ in node.ifs: -+ self.write(' if ') -+ self.visit(if_) -+ -+ def visit_excepthandler(self, node): -+ self.newline() -+ self.write('except') -+ if node.type is not None: -+ self.write(' ') -+ self.visit(node.type) -+ if node.name is not None: -+ self.write(' as ') -+ self.visit(node.name) -+ self.write(':') -+ self.body(node.body) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py -new file mode 100644 -index 0000000..65fd84d ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py -@@ -0,0 +1,178 @@ -+# mako/ast.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""utilities for analyzing expressions and blocks of Python -+code, as well as generating Python from AST nodes""" -+ -+from mako import exceptions, pyparser, compat -+import re -+ -+class PythonCode(object): -+ """represents information about a string containing Python code""" -+ def __init__(self, code, **exception_kwargs): -+ self.code = code -+ -+ # represents all identifiers which are assigned to at some point in -+ # the code -+ self.declared_identifiers = set() -+ -+ # represents all identifiers which are referenced before their -+ # assignment, if any -+ self.undeclared_identifiers = set() -+ -+ # note that an identifier can be in both the undeclared and declared -+ # lists. 
-+ -+ # using AST to parse instead of using code.co_varnames, -+ # code.co_names has several advantages: -+ # - we can locate an identifier as "undeclared" even if -+ # its declared later in the same block of code -+ # - AST is less likely to break with version changes -+ # (for example, the behavior of co_names changed a little bit -+ # in python version 2.5) -+ if isinstance(code, compat.string_types): -+ expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs) -+ else: -+ expr = code -+ -+ f = pyparser.FindIdentifiers(self, **exception_kwargs) -+ f.visit(expr) -+ -+class ArgumentList(object): -+ """parses a fragment of code as a comma-separated list of expressions""" -+ def __init__(self, code, **exception_kwargs): -+ self.codeargs = [] -+ self.args = [] -+ self.declared_identifiers = set() -+ self.undeclared_identifiers = set() -+ if isinstance(code, compat.string_types): -+ if re.match(r"\S", code) and not re.match(r",\s*$", code): -+ # if theres text and no trailing comma, insure its parsed -+ # as a tuple by adding a trailing comma -+ code += "," -+ expr = pyparser.parse(code, "exec", **exception_kwargs) -+ else: -+ expr = code -+ -+ f = pyparser.FindTuple(self, PythonCode, **exception_kwargs) -+ f.visit(expr) -+ -+class PythonFragment(PythonCode): -+ """extends PythonCode to provide identifier lookups in partial control -+ statements -+ -+ e.g. -+ for x in 5: -+ elif y==9: -+ except (MyException, e): -+ etc. -+ """ -+ def __init__(self, code, **exception_kwargs): -+ m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S) -+ if not m: -+ raise exceptions.CompileException( -+ "Fragment '%s' is not a partial control statement" % -+ code, **exception_kwargs) -+ if m.group(3): -+ code = code[:m.start(3)] -+ (keyword, expr) = m.group(1,2) -+ if keyword in ['for','if', 'while']: -+ code = code + "pass" -+ elif keyword == 'try': -+ code = code + "pass\nexcept:pass" -+ elif keyword == 'elif' or keyword == 'else': -+ code = "if False:pass\n" + code + "pass" -+ elif keyword == 'except': -+ code = "try:pass\n" + code + "pass" -+ elif keyword == 'with': -+ code = code + "pass" -+ else: -+ raise exceptions.CompileException( -+ "Unsupported control keyword: '%s'" % -+ keyword, **exception_kwargs) -+ super(PythonFragment, self).__init__(code, **exception_kwargs) -+ -+ -+class FunctionDecl(object): -+ """function declaration""" -+ def __init__(self, code, allow_kwargs=True, **exception_kwargs): -+ self.code = code -+ expr = pyparser.parse(code, "exec", **exception_kwargs) -+ -+ f = pyparser.ParseFunc(self, **exception_kwargs) -+ f.visit(expr) -+ if not hasattr(self, 'funcname'): -+ raise exceptions.CompileException( -+ "Code '%s' is not a function declaration" % code, -+ **exception_kwargs) -+ if not allow_kwargs and self.kwargs: -+ raise exceptions.CompileException( -+ "'**%s' keyword argument not allowed here" % -+ self.kwargnames[-1], **exception_kwargs) -+ -+ def get_argument_expressions(self, as_call=False): -+ """Return the argument declarations of this FunctionDecl as a printable -+ list. -+ -+ By default the return value is appropriate for writing in a ``def``; -+ set `as_call` to true to build arguments to be passed to the function -+ instead (assuming locals with the same names as the arguments exist). 
-+ """ -+ -+ namedecls = [] -+ -+ # Build in reverse order, since defaults and slurpy args come last -+ argnames = self.argnames[::-1] -+ kwargnames = self.kwargnames[::-1] -+ defaults = self.defaults[::-1] -+ kwdefaults = self.kwdefaults[::-1] -+ -+ # Named arguments -+ if self.kwargs: -+ namedecls.append("**" + kwargnames.pop(0)) -+ -+ for name in kwargnames: -+ # Keyword-only arguments must always be used by name, so even if -+ # this is a call, print out `foo=foo` -+ if as_call: -+ namedecls.append("%s=%s" % (name, name)) -+ elif kwdefaults: -+ default = kwdefaults.pop(0) -+ if default is None: -+ # The AST always gives kwargs a default, since you can do -+ # `def foo(*, a=1, b, c=3)` -+ namedecls.append(name) -+ else: -+ namedecls.append("%s=%s" % ( -+ name, pyparser.ExpressionGenerator(default).value())) -+ else: -+ namedecls.append(name) -+ -+ # Positional arguments -+ if self.varargs: -+ namedecls.append("*" + argnames.pop(0)) -+ -+ for name in argnames: -+ if as_call or not defaults: -+ namedecls.append(name) -+ else: -+ default = defaults.pop(0) -+ namedecls.append("%s=%s" % ( -+ name, pyparser.ExpressionGenerator(default).value())) -+ -+ namedecls.reverse() -+ return namedecls -+ -+ @property -+ def allargnames(self): -+ return tuple(self.argnames) + tuple(self.kwargnames) -+ -+class FunctionArgs(FunctionDecl): -+ """the argument portion of a function declaration""" -+ -+ def __init__(self, code, **kwargs): -+ super(FunctionArgs, self).__init__("def ANON(%s):pass" % code, -+ **kwargs) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py -new file mode 100644 -index 0000000..c405c51 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py -@@ -0,0 +1,238 @@ -+# mako/cache.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+from mako import compat, util -+ -+_cache_plugins = util.PluginLoader("mako.cache") -+ -+register_plugin = _cache_plugins.register -+register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl") -+ -+ -+class Cache(object): -+ """Represents a data content cache made available to the module -+ space of a specific :class:`.Template` object. -+ -+ .. versionadded:: 0.6 -+ :class:`.Cache` by itself is mostly a -+ container for a :class:`.CacheImpl` object, which implements -+ a fixed API to provide caching services; specific subclasses exist to -+ implement different -+ caching strategies. Mako includes a backend that works with -+ the Beaker caching system. Beaker itself then supports -+ a number of backends (i.e. file, memory, memcached, etc.) -+ -+ The construction of a :class:`.Cache` is part of the mechanics -+ of a :class:`.Template`, and programmatic access to this -+ cache is typically via the :attr:`.Template.cache` attribute. -+ -+ """ -+ -+ impl = None -+ """Provide the :class:`.CacheImpl` in use by this :class:`.Cache`. -+ -+ This accessor allows a :class:`.CacheImpl` with additional -+ methods beyond that of :class:`.Cache` to be used programmatically. -+ -+ """ -+ -+ id = None -+ """Return the 'id' that identifies this cache. -+ -+ This is a value that should be globally unique to the -+ :class:`.Template` associated with this cache, and can -+ be used by a caching system to name a local container -+ for data specific to this template. 
-+ -+ """ -+ -+ starttime = None -+ """Epochal time value for when the owning :class:`.Template` was -+ first compiled. -+ -+ A cache implementation may wish to invalidate data earlier than -+ this timestamp; this has the effect of the cache for a specific -+ :class:`.Template` starting clean any time the :class:`.Template` -+ is recompiled, such as when the original template file changed on -+ the filesystem. -+ -+ """ -+ -+ def __init__(self, template, *args): -+ # check for a stale template calling the -+ # constructor -+ if isinstance(template, compat.string_types) and args: -+ return -+ self.template = template -+ self.id = template.module.__name__ -+ self.starttime = template.module._modified_time -+ self._def_regions = {} -+ self.impl = self._load_impl(self.template.cache_impl) -+ -+ def _load_impl(self, name): -+ return _cache_plugins.load(name)(self) -+ -+ def get_or_create(self, key, creation_function, **kw): -+ """Retrieve a value from the cache, using the given creation function -+ to generate a new value.""" -+ -+ return self._ctx_get_or_create(key, creation_function, None, **kw) -+ -+ def _ctx_get_or_create(self, key, creation_function, context, **kw): -+ """Retrieve a value from the cache, using the given creation function -+ to generate a new value.""" -+ -+ if not self.template.cache_enabled: -+ return creation_function() -+ -+ return self.impl.get_or_create( -+ key, -+ creation_function, -+ **self._get_cache_kw(kw, context)) -+ -+ def set(self, key, value, **kw): -+ """Place a value in the cache. -+ -+ :param key: the value's key. -+ :param value: the value. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ -+ self.impl.set(key, value, **self._get_cache_kw(kw, None)) -+ -+ put = set -+ """A synonym for :meth:`.Cache.set`. -+ -+ This is here for backwards compatibility. -+ -+ """ -+ -+ def get(self, key, **kw): -+ """Retrieve a value from the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. The -+ backend is configured using these arguments upon first request. -+ Subsequent requests that use the same series of configuration -+ values will use that same backend. -+ -+ """ -+ return self.impl.get(key, **self._get_cache_kw(kw, None)) -+ -+ def invalidate(self, key, **kw): -+ """Invalidate a value in the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. The -+ backend is configured using these arguments upon first request. -+ Subsequent requests that use the same series of configuration -+ values will use that same backend. -+ -+ """ -+ self.impl.invalidate(key, **self._get_cache_kw(kw, None)) -+ -+ def invalidate_body(self): -+ """Invalidate the cached content of the "body" method for this -+ template. -+ -+ """ -+ self.invalidate('render_body', __M_defname='render_body') -+ -+ def invalidate_def(self, name): -+ """Invalidate the cached content of a particular ``<%def>`` within this -+ template. -+ -+ """ -+ -+ self.invalidate('render_%s' % name, __M_defname='render_%s' % name) -+ -+ def invalidate_closure(self, name): -+ """Invalidate a nested ``<%def>`` within this template. -+ -+ Caching of nested defs is a blunt tool as there is no -+ management of scope -- nested defs that use cache tags -+ need to have names unique of all other nested defs in the -+ template, else their content will be overwritten by -+ each other. 
-+ -+ """ -+ -+ self.invalidate(name, __M_defname=name) -+ -+ def _get_cache_kw(self, kw, context): -+ defname = kw.pop('__M_defname', None) -+ if not defname: -+ tmpl_kw = self.template.cache_args.copy() -+ tmpl_kw.update(kw) -+ elif defname in self._def_regions: -+ tmpl_kw = self._def_regions[defname] -+ else: -+ tmpl_kw = self.template.cache_args.copy() -+ tmpl_kw.update(kw) -+ self._def_regions[defname] = tmpl_kw -+ if context and self.impl.pass_context: -+ tmpl_kw = tmpl_kw.copy() -+ tmpl_kw.setdefault('context', context) -+ return tmpl_kw -+ -+ -+class CacheImpl(object): -+ """Provide a cache implementation for use by :class:`.Cache`.""" -+ -+ def __init__(self, cache): -+ self.cache = cache -+ -+ pass_context = False -+ """If ``True``, the :class:`.Context` will be passed to -+ :meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``. -+ """ -+ -+ def get_or_create(self, key, creation_function, **kw): -+ """Retrieve a value from the cache, using the given creation function -+ to generate a new value. -+ -+ This function *must* return a value, either from -+ the cache, or via the given creation function. -+ If the creation function is called, the newly -+ created value should be populated into the cache -+ under the given key before being returned. -+ -+ :param key: the value's key. -+ :param creation_function: function that when called generates -+ a new value. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -+ -+ def set(self, key, value, **kw): -+ """Place a value in the cache. -+ -+ :param key: the value's key. -+ :param value: the value. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -+ -+ def get(self, key, **kw): -+ """Retrieve a value from the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -+ -+ def invalidate(self, key, **kw): -+ """Invalidate a value in the cache. -+ -+ :param key: the value's key. -+ :param \**kw: cache configuration arguments. -+ -+ """ -+ raise NotImplementedError() -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py -new file mode 100644 -index 0000000..1a9ca56 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py -@@ -0,0 +1,62 @@ -+# mako/cmd.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+from argparse import ArgumentParser -+from os.path import isfile, dirname -+import sys -+from mako.template import Template -+from mako.lookup import TemplateLookup -+from mako import exceptions -+ -+def varsplit(var): -+ if "=" not in var: -+ return (var, "") -+ return var.split("=", 1) -+ -+def _exit(): -+ sys.stderr.write(exceptions.text_error_template().render()) -+ sys.exit(1) -+ -+def cmdline(argv=None): -+ -+ parser = ArgumentParser("usage: %prog [FILENAME]") -+ parser.add_argument("--var", default=[], action="append", -+ help="variable (can be used multiple times, use name=value)") -+ parser.add_argument("--template-dir", default=[], action="append", -+ help="Directory to use for template lookup (multiple " -+ "directories may be provided). 
If not given then if the " -+ "template is read from stdin, the value defaults to be " -+ "the current directory, otherwise it defaults to be the " -+ "parent directory of the file provided.") -+ parser.add_argument('input', nargs='?', default='-') -+ -+ options = parser.parse_args(argv) -+ if options.input == '-': -+ lookup_dirs = options.template_dir or ["."] -+ lookup = TemplateLookup(lookup_dirs) -+ try: -+ template = Template(sys.stdin.read(), lookup=lookup) -+ except: -+ _exit() -+ else: -+ filename = options.input -+ if not isfile(filename): -+ raise SystemExit("error: can't find %s" % filename) -+ lookup_dirs = options.template_dir or [dirname(filename)] -+ lookup = TemplateLookup(lookup_dirs) -+ try: -+ template = Template(filename=filename, lookup=lookup) -+ except: -+ _exit() -+ -+ kw = dict([varsplit(var) for var in options.var]) -+ try: -+ print(template.render(**kw)) -+ except: -+ _exit() -+ -+ -+if __name__ == "__main__": -+ cmdline() -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py -new file mode 100644 -index 0000000..4b0bda8 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py -@@ -0,0 +1,1237 @@ -+# mako/codegen.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""provides functionality for rendering a parsetree constructing into module -+source code.""" -+ -+import time -+import re -+from mako.pygen import PythonPrinter -+from mako import util, ast, parsetree, filters, exceptions -+from mako import compat -+ -+ -+MAGIC_NUMBER = 10 -+ -+# names which are hardwired into the -+# template and are not accessed via the -+# context itself -+RESERVED_NAMES = set(['context', 'loop', 'UNDEFINED']) -+ -+def compile(node, -+ uri, -+ filename=None, -+ default_filters=None, -+ buffer_filters=None, -+ imports=None, -+ future_imports=None, -+ source_encoding=None, -+ generate_magic_comment=True, -+ disable_unicode=False, -+ strict_undefined=False, -+ enable_loop=True, -+ reserved_names=frozenset()): -+ -+ """Generate module source code given a parsetree node, -+ uri, and optional source filename""" -+ -+ # if on Py2K, push the "source_encoding" string to be -+ # a bytestring itself, as we will be embedding it into -+ # the generated source and we don't want to coerce the -+ # result into a unicode object, in "disable_unicode" mode -+ if not compat.py3k and isinstance(source_encoding, compat.text_type): -+ source_encoding = source_encoding.encode(source_encoding) -+ -+ -+ buf = util.FastEncodingBuffer() -+ -+ printer = PythonPrinter(buf) -+ _GenerateRenderMethod(printer, -+ _CompileContext(uri, -+ filename, -+ default_filters, -+ buffer_filters, -+ imports, -+ future_imports, -+ source_encoding, -+ generate_magic_comment, -+ disable_unicode, -+ strict_undefined, -+ enable_loop, -+ reserved_names), -+ node) -+ return buf.getvalue() -+ -+class _CompileContext(object): -+ def __init__(self, -+ uri, -+ filename, -+ default_filters, -+ buffer_filters, -+ imports, -+ future_imports, -+ source_encoding, -+ generate_magic_comment, -+ disable_unicode, -+ strict_undefined, -+ enable_loop, -+ reserved_names): -+ self.uri = uri -+ self.filename = filename -+ self.default_filters = default_filters -+ self.buffer_filters = buffer_filters -+ self.imports = imports -+ self.future_imports = future_imports -+ self.source_encoding = 
source_encoding -+ self.generate_magic_comment = generate_magic_comment -+ self.disable_unicode = disable_unicode -+ self.strict_undefined = strict_undefined -+ self.enable_loop = enable_loop -+ self.reserved_names = reserved_names -+ -+class _GenerateRenderMethod(object): -+ """A template visitor object which generates the -+ full module source for a template. -+ -+ """ -+ def __init__(self, printer, compiler, node): -+ self.printer = printer -+ self.compiler = compiler -+ self.node = node -+ self.identifier_stack = [None] -+ self.in_def = isinstance(node, (parsetree.DefTag, parsetree.BlockTag)) -+ -+ if self.in_def: -+ name = "render_%s" % node.funcname -+ args = node.get_argument_expressions() -+ filtered = len(node.filter_args.args) > 0 -+ buffered = eval(node.attributes.get('buffered', 'False')) -+ cached = eval(node.attributes.get('cached', 'False')) -+ defs = None -+ pagetag = None -+ if node.is_block and not node.is_anonymous: -+ args += ['**pageargs'] -+ else: -+ defs = self.write_toplevel() -+ pagetag = self.compiler.pagetag -+ name = "render_body" -+ if pagetag is not None: -+ args = pagetag.body_decl.get_argument_expressions() -+ if not pagetag.body_decl.kwargs: -+ args += ['**pageargs'] -+ cached = eval(pagetag.attributes.get('cached', 'False')) -+ self.compiler.enable_loop = self.compiler.enable_loop or eval( -+ pagetag.attributes.get( -+ 'enable_loop', 'False') -+ ) -+ else: -+ args = ['**pageargs'] -+ cached = False -+ buffered = filtered = False -+ if args is None: -+ args = ['context'] -+ else: -+ args = [a for a in ['context'] + args] -+ -+ self.write_render_callable( -+ pagetag or node, -+ name, args, -+ buffered, filtered, cached) -+ -+ if defs is not None: -+ for node in defs: -+ _GenerateRenderMethod(printer, compiler, node) -+ -+ if not self.in_def: -+ self.write_metadata_struct() -+ -+ def write_metadata_struct(self): -+ self.printer.source_map[self.printer.lineno] = \ -+ max(self.printer.source_map) -+ struct = { -+ "filename": self.compiler.filename, -+ "uri": self.compiler.uri, -+ "source_encoding": self.compiler.source_encoding, -+ "line_map": self.printer.source_map, -+ } -+ self.printer.writelines( -+ '"""', -+ '__M_BEGIN_METADATA', -+ compat.json.dumps(struct), -+ '__M_END_METADATA\n' -+ '"""' -+ ) -+ -+ @property -+ def identifiers(self): -+ return self.identifier_stack[-1] -+ -+ def write_toplevel(self): -+ """Traverse a template structure for module-level directives and -+ generate the start of module-level code. 
-+ -+ """ -+ inherit = [] -+ namespaces = {} -+ module_code = [] -+ -+ self.compiler.pagetag = None -+ -+ class FindTopLevel(object): -+ def visitInheritTag(s, node): -+ inherit.append(node) -+ def visitNamespaceTag(s, node): -+ namespaces[node.name] = node -+ def visitPageTag(s, node): -+ self.compiler.pagetag = node -+ def visitCode(s, node): -+ if node.ismodule: -+ module_code.append(node) -+ -+ f = FindTopLevel() -+ for n in self.node.nodes: -+ n.accept_visitor(f) -+ -+ self.compiler.namespaces = namespaces -+ -+ module_ident = set() -+ for n in module_code: -+ module_ident = module_ident.union(n.declared_identifiers()) -+ -+ module_identifiers = _Identifiers(self.compiler) -+ module_identifiers.declared = module_ident -+ -+ # module-level names, python code -+ if self.compiler.generate_magic_comment and \ -+ self.compiler.source_encoding: -+ self.printer.writeline("# -*- coding:%s -*-" % -+ self.compiler.source_encoding) -+ -+ if self.compiler.future_imports: -+ self.printer.writeline("from __future__ import %s" % -+ (", ".join(self.compiler.future_imports),)) -+ self.printer.writeline("from mako import runtime, filters, cache") -+ self.printer.writeline("UNDEFINED = runtime.UNDEFINED") -+ self.printer.writeline("__M_dict_builtin = dict") -+ self.printer.writeline("__M_locals_builtin = locals") -+ self.printer.writeline("_magic_number = %r" % MAGIC_NUMBER) -+ self.printer.writeline("_modified_time = %r" % time.time()) -+ self.printer.writeline("_enable_loop = %r" % self.compiler.enable_loop) -+ self.printer.writeline( -+ "_template_filename = %r" % self.compiler.filename) -+ self.printer.writeline("_template_uri = %r" % self.compiler.uri) -+ self.printer.writeline( -+ "_source_encoding = %r" % self.compiler.source_encoding) -+ if self.compiler.imports: -+ buf = '' -+ for imp in self.compiler.imports: -+ buf += imp + "\n" -+ self.printer.writeline(imp) -+ impcode = ast.PythonCode( -+ buf, -+ source='', lineno=0, -+ pos=0, -+ filename='template defined imports') -+ else: -+ impcode = None -+ -+ main_identifiers = module_identifiers.branch(self.node) -+ module_identifiers.topleveldefs = \ -+ module_identifiers.topleveldefs.\ -+ union(main_identifiers.topleveldefs) -+ module_identifiers.declared.add("UNDEFINED") -+ if impcode: -+ module_identifiers.declared.update(impcode.declared_identifiers) -+ -+ self.compiler.identifiers = module_identifiers -+ self.printer.writeline("_exports = %r" % -+ [n.name for n in -+ main_identifiers.topleveldefs.values()] -+ ) -+ self.printer.write_blanks(2) -+ -+ if len(module_code): -+ self.write_module_code(module_code) -+ -+ if len(inherit): -+ self.write_namespaces(namespaces) -+ self.write_inherit(inherit[-1]) -+ elif len(namespaces): -+ self.write_namespaces(namespaces) -+ -+ return list(main_identifiers.topleveldefs.values()) -+ -+ def write_render_callable(self, node, name, args, buffered, filtered, -+ cached): -+ """write a top-level render callable. 
-+ -+ this could be the main render() method or that of a top-level def.""" -+ -+ if self.in_def: -+ decorator = node.decorator -+ if decorator: -+ self.printer.writeline( -+ "@runtime._decorate_toplevel(%s)" % decorator) -+ -+ self.printer.start_source(node.lineno) -+ self.printer.writelines( -+ "def %s(%s):" % (name, ','.join(args)), -+ # push new frame, assign current frame to __M_caller -+ "__M_caller = context.caller_stack._push_frame()", -+ "try:" -+ ) -+ if buffered or filtered or cached: -+ self.printer.writeline("context._push_buffer()") -+ -+ self.identifier_stack.append( -+ self.compiler.identifiers.branch(self.node)) -+ if (not self.in_def or self.node.is_block) and '**pageargs' in args: -+ self.identifier_stack[-1].argument_declared.add('pageargs') -+ -+ if not self.in_def and ( -+ len(self.identifiers.locally_assigned) > 0 or -+ len(self.identifiers.argument_declared) > 0 -+ ): -+ self.printer.writeline("__M_locals = __M_dict_builtin(%s)" % -+ ','.join([ -+ "%s=%s" % (x, x) for x in -+ self.identifiers.argument_declared -+ ])) -+ -+ self.write_variable_declares(self.identifiers, toplevel=True) -+ -+ for n in self.node.nodes: -+ n.accept_visitor(self) -+ -+ self.write_def_finish(self.node, buffered, filtered, cached) -+ self.printer.writeline(None) -+ self.printer.write_blanks(2) -+ if cached: -+ self.write_cache_decorator( -+ node, name, -+ args, buffered, -+ self.identifiers, toplevel=True) -+ -+ def write_module_code(self, module_code): -+ """write module-level template code, i.e. that which -+ is enclosed in <%! %> tags in the template.""" -+ for n in module_code: -+ self.printer.start_source(n.lineno) -+ self.printer.write_indented_block(n.text) -+ -+ def write_inherit(self, node): -+ """write the module-level inheritance-determination callable.""" -+ -+ self.printer.writelines( -+ "def _mako_inherit(template, context):", -+ "_mako_generate_namespaces(context)", -+ "return runtime._inherit_from(context, %s, _template_uri)" % -+ (node.parsed_attributes['file']), -+ None -+ ) -+ -+ def write_namespaces(self, namespaces): -+ """write the module-level namespace-generating callable.""" -+ self.printer.writelines( -+ "def _mako_get_namespace(context, name):", -+ "try:", -+ "return context.namespaces[(__name__, name)]", -+ "except KeyError:", -+ "_mako_generate_namespaces(context)", -+ "return context.namespaces[(__name__, name)]", -+ None, None -+ ) -+ self.printer.writeline("def _mako_generate_namespaces(context):") -+ -+ -+ for node in namespaces.values(): -+ if 'import' in node.attributes: -+ self.compiler.has_ns_imports = True -+ self.printer.start_source(node.lineno) -+ if len(node.nodes): -+ self.printer.writeline("def make_namespace():") -+ export = [] -+ identifiers = self.compiler.identifiers.branch(node) -+ self.in_def = True -+ class NSDefVisitor(object): -+ def visitDefTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitBlockTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitDefOrBase(s, node): -+ if node.is_anonymous: -+ raise exceptions.CompileException( -+ "Can't put anonymous blocks inside " -+ "<%namespace>", -+ **node.exception_kwargs -+ ) -+ self.write_inline_def(node, identifiers, nested=False) -+ export.append(node.funcname) -+ vis = NSDefVisitor() -+ for n in node.nodes: -+ n.accept_visitor(vis) -+ self.printer.writeline("return [%s]" % (','.join(export))) -+ self.printer.writeline(None) -+ self.in_def = False -+ callable_name = "make_namespace()" -+ else: -+ callable_name = "None" -+ -+ if 'file' in node.parsed_attributes: -+ 
self.printer.writeline( -+ "ns = runtime.TemplateNamespace(%r," -+ " context._clean_inheritance_tokens()," -+ " templateuri=%s, callables=%s, " -+ " calling_uri=_template_uri)" % -+ ( -+ node.name, -+ node.parsed_attributes.get('file', 'None'), -+ callable_name, -+ ) -+ ) -+ elif 'module' in node.parsed_attributes: -+ self.printer.writeline( -+ "ns = runtime.ModuleNamespace(%r," -+ " context._clean_inheritance_tokens()," -+ " callables=%s, calling_uri=_template_uri," -+ " module=%s)" % -+ ( -+ node.name, -+ callable_name, -+ node.parsed_attributes.get( -+ 'module', 'None') -+ ) -+ ) -+ else: -+ self.printer.writeline( -+ "ns = runtime.Namespace(%r," -+ " context._clean_inheritance_tokens()," -+ " callables=%s, calling_uri=_template_uri)" % -+ ( -+ node.name, -+ callable_name, -+ ) -+ ) -+ if eval(node.attributes.get('inheritable', "False")): -+ self.printer.writeline("context['self'].%s = ns" % (node.name)) -+ -+ self.printer.writeline( -+ "context.namespaces[(__name__, %s)] = ns" % repr(node.name)) -+ self.printer.write_blanks(1) -+ if not len(namespaces): -+ self.printer.writeline("pass") -+ self.printer.writeline(None) -+ -+ def write_variable_declares(self, identifiers, toplevel=False, limit=None): -+ """write variable declarations at the top of a function. -+ -+ the variable declarations are in the form of callable -+ definitions for defs and/or name lookup within the -+ function's context argument. the names declared are based -+ on the names that are referenced in the function body, -+ which don't otherwise have any explicit assignment -+ operation. names that are assigned within the body are -+ assumed to be locally-scoped variables and are not -+ separately declared. -+ -+ for def callable definitions, if the def is a top-level -+ callable then a 'stub' callable is generated which wraps -+ the current Context into a closure. if the def is not -+ top-level, it is fully rendered as a local closure. -+ -+ """ -+ -+ # collection of all defs available to us in this scope -+ comp_idents = dict([(c.funcname, c) for c in identifiers.defs]) -+ to_write = set() -+ -+ # write "context.get()" for all variables we are going to -+ # need that arent in the namespace yet -+ to_write = to_write.union(identifiers.undeclared) -+ -+ # write closure functions for closures that we define -+ # right here -+ to_write = to_write.union( -+ [c.funcname for c in identifiers.closuredefs.values()]) -+ -+ # remove identifiers that are declared in the argument -+ # signature of the callable -+ to_write = to_write.difference(identifiers.argument_declared) -+ -+ # remove identifiers that we are going to assign to. -+ # in this way we mimic Python's behavior, -+ # i.e. assignment to a variable within a block -+ # means that variable is now a "locally declared" var, -+ # which cannot be referenced beforehand. -+ to_write = to_write.difference(identifiers.locally_declared) -+ -+ if self.compiler.enable_loop: -+ has_loop = "loop" in to_write -+ to_write.discard("loop") -+ else: -+ has_loop = False -+ -+ # if a limiting set was sent, constraint to those items in that list -+ # (this is used for the caching decorator) -+ if limit is not None: -+ to_write = to_write.intersection(limit) -+ -+ if toplevel and getattr(self.compiler, 'has_ns_imports', False): -+ self.printer.writeline("_import_ns = {}") -+ self.compiler.has_imports = True -+ for ident, ns in self.compiler.namespaces.items(): -+ if 'import' in ns.attributes: -+ self.printer.writeline( -+ "_mako_get_namespace(context, %r)." 
-+ "_populate(_import_ns, %r)" % -+ ( -+ ident, -+ re.split(r'\s*,\s*', ns.attributes['import']) -+ )) -+ -+ if has_loop: -+ self.printer.writeline( -+ 'loop = __M_loop = runtime.LoopStack()' -+ ) -+ -+ for ident in to_write: -+ if ident in comp_idents: -+ comp = comp_idents[ident] -+ if comp.is_block: -+ if not comp.is_anonymous: -+ self.write_def_decl(comp, identifiers) -+ else: -+ self.write_inline_def(comp, identifiers, nested=True) -+ else: -+ if comp.is_root(): -+ self.write_def_decl(comp, identifiers) -+ else: -+ self.write_inline_def(comp, identifiers, nested=True) -+ -+ elif ident in self.compiler.namespaces: -+ self.printer.writeline( -+ "%s = _mako_get_namespace(context, %r)" % -+ (ident, ident) -+ ) -+ else: -+ if getattr(self.compiler, 'has_ns_imports', False): -+ if self.compiler.strict_undefined: -+ self.printer.writelines( -+ "%s = _import_ns.get(%r, UNDEFINED)" % -+ (ident, ident), -+ "if %s is UNDEFINED:" % ident, -+ "try:", -+ "%s = context[%r]" % (ident, ident), -+ "except KeyError:", -+ "raise NameError(\"'%s' is not defined\")" % -+ ident, -+ None, None -+ ) -+ else: -+ self.printer.writeline( -+ "%s = _import_ns.get(%r, context.get(%r, UNDEFINED))" % -+ (ident, ident, ident)) -+ else: -+ if self.compiler.strict_undefined: -+ self.printer.writelines( -+ "try:", -+ "%s = context[%r]" % (ident, ident), -+ "except KeyError:", -+ "raise NameError(\"'%s' is not defined\")" % -+ ident, -+ None -+ ) -+ else: -+ self.printer.writeline( -+ "%s = context.get(%r, UNDEFINED)" % (ident, ident) -+ ) -+ -+ self.printer.writeline("__M_writer = context.writer()") -+ -+ def write_def_decl(self, node, identifiers): -+ """write a locally-available callable referencing a top-level def""" -+ funcname = node.funcname -+ namedecls = node.get_argument_expressions() -+ nameargs = node.get_argument_expressions(as_call=True) -+ -+ if not self.in_def and ( -+ len(self.identifiers.locally_assigned) > 0 or -+ len(self.identifiers.argument_declared) > 0): -+ nameargs.insert(0, 'context._locals(__M_locals)') -+ else: -+ nameargs.insert(0, 'context') -+ self.printer.writeline("def %s(%s):" % (funcname, ",".join(namedecls))) -+ self.printer.writeline( -+ "return render_%s(%s)" % (funcname, ",".join(nameargs))) -+ self.printer.writeline(None) -+ -+ def write_inline_def(self, node, identifiers, nested): -+ """write a locally-available def callable inside an enclosing def.""" -+ -+ namedecls = node.get_argument_expressions() -+ -+ decorator = node.decorator -+ if decorator: -+ self.printer.writeline( -+ "@runtime._decorate_inline(context, %s)" % decorator) -+ self.printer.writeline( -+ "def %s(%s):" % (node.funcname, ",".join(namedecls))) -+ filtered = len(node.filter_args.args) > 0 -+ buffered = eval(node.attributes.get('buffered', 'False')) -+ cached = eval(node.attributes.get('cached', 'False')) -+ self.printer.writelines( -+ # push new frame, assign current frame to __M_caller -+ "__M_caller = context.caller_stack._push_frame()", -+ "try:" -+ ) -+ if buffered or filtered or cached: -+ self.printer.writelines( -+ "context._push_buffer()", -+ ) -+ -+ identifiers = identifiers.branch(node, nested=nested) -+ -+ self.write_variable_declares(identifiers) -+ -+ self.identifier_stack.append(identifiers) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ self.identifier_stack.pop() -+ -+ self.write_def_finish(node, buffered, filtered, cached) -+ self.printer.writeline(None) -+ if cached: -+ self.write_cache_decorator(node, node.funcname, -+ namedecls, False, identifiers, -+ inline=True, toplevel=False) -+ -+ 
def write_def_finish(self, node, buffered, filtered, cached, -+ callstack=True): -+ """write the end section of a rendering function, either outermost or -+ inline. -+ -+ this takes into account if the rendering function was filtered, -+ buffered, etc. and closes the corresponding try: block if any, and -+ writes code to retrieve captured content, apply filters, send proper -+ return value.""" -+ -+ if not buffered and not cached and not filtered: -+ self.printer.writeline("return ''") -+ if callstack: -+ self.printer.writelines( -+ "finally:", -+ "context.caller_stack._pop_frame()", -+ None -+ ) -+ -+ if buffered or filtered or cached: -+ if buffered or cached: -+ # in a caching scenario, don't try to get a writer -+ # from the context after popping; assume the caching -+ # implemenation might be using a context with no -+ # extra buffers -+ self.printer.writelines( -+ "finally:", -+ "__M_buf = context._pop_buffer()" -+ ) -+ else: -+ self.printer.writelines( -+ "finally:", -+ "__M_buf, __M_writer = context._pop_buffer_and_writer()" -+ ) -+ -+ if callstack: -+ self.printer.writeline("context.caller_stack._pop_frame()") -+ -+ s = "__M_buf.getvalue()" -+ if filtered: -+ s = self.create_filter_callable(node.filter_args.args, s, -+ False) -+ self.printer.writeline(None) -+ if buffered and not cached: -+ s = self.create_filter_callable(self.compiler.buffer_filters, -+ s, False) -+ if buffered or cached: -+ self.printer.writeline("return %s" % s) -+ else: -+ self.printer.writelines( -+ "__M_writer(%s)" % s, -+ "return ''" -+ ) -+ -+ def write_cache_decorator(self, node_or_pagetag, name, -+ args, buffered, identifiers, -+ inline=False, toplevel=False): -+ """write a post-function decorator to replace a rendering -+ callable with a cached version of itself.""" -+ -+ self.printer.writeline("__M_%s = %s" % (name, name)) -+ cachekey = node_or_pagetag.parsed_attributes.get('cache_key', -+ repr(name)) -+ -+ cache_args = {} -+ if self.compiler.pagetag is not None: -+ cache_args.update( -+ ( -+ pa[6:], -+ self.compiler.pagetag.parsed_attributes[pa] -+ ) -+ for pa in self.compiler.pagetag.parsed_attributes -+ if pa.startswith('cache_') and pa != 'cache_key' -+ ) -+ cache_args.update( -+ ( -+ pa[6:], -+ node_or_pagetag.parsed_attributes[pa] -+ ) for pa in node_or_pagetag.parsed_attributes -+ if pa.startswith('cache_') and pa != 'cache_key' -+ ) -+ if 'timeout' in cache_args: -+ cache_args['timeout'] = int(eval(cache_args['timeout'])) -+ -+ self.printer.writeline("def %s(%s):" % (name, ','.join(args))) -+ -+ # form "arg1, arg2, arg3=arg3, arg4=arg4", etc. -+ pass_args = [ -+ "%s=%s" % ((a.split('=')[0],) * 2) if '=' in a else a -+ for a in args -+ ] -+ -+ self.write_variable_declares( -+ identifiers, -+ toplevel=toplevel, -+ limit=node_or_pagetag.undeclared_identifiers() -+ ) -+ if buffered: -+ s = "context.get('local')."\ -+ "cache._ctx_get_or_create("\ -+ "%s, lambda:__M_%s(%s), context, %s__M_defname=%r)" % ( -+ cachekey, name, ','.join(pass_args), -+ ''.join(["%s=%s, " % (k, v) -+ for k, v in cache_args.items()]), -+ name -+ ) -+ # apply buffer_filters -+ s = self.create_filter_callable(self.compiler.buffer_filters, s, -+ False) -+ self.printer.writelines("return " + s, None) -+ else: -+ self.printer.writelines( -+ "__M_writer(context.get('local')." 
-+ "cache._ctx_get_or_create(" -+ "%s, lambda:__M_%s(%s), context, %s__M_defname=%r))" % -+ ( -+ cachekey, name, ','.join(pass_args), -+ ''.join(["%s=%s, " % (k, v) -+ for k, v in cache_args.items()]), -+ name, -+ ), -+ "return ''", -+ None -+ ) -+ -+ def create_filter_callable(self, args, target, is_expression): -+ """write a filter-applying expression based on the filters -+ present in the given filter names, adjusting for the global -+ 'default' filter aliases as needed.""" -+ -+ def locate_encode(name): -+ if re.match(r'decode\..+', name): -+ return "filters." + name -+ elif self.compiler.disable_unicode: -+ return filters.NON_UNICODE_ESCAPES.get(name, name) -+ else: -+ return filters.DEFAULT_ESCAPES.get(name, name) -+ -+ if 'n' not in args: -+ if is_expression: -+ if self.compiler.pagetag: -+ args = self.compiler.pagetag.filter_args.args + args -+ if self.compiler.default_filters: -+ args = self.compiler.default_filters + args -+ for e in args: -+ # if filter given as a function, get just the identifier portion -+ if e == 'n': -+ continue -+ m = re.match(r'(.+?)(\(.*\))', e) -+ if m: -+ ident, fargs = m.group(1, 2) -+ f = locate_encode(ident) -+ e = f + fargs -+ else: -+ e = locate_encode(e) -+ assert e is not None -+ target = "%s(%s)" % (e, target) -+ return target -+ -+ def visitExpression(self, node): -+ self.printer.start_source(node.lineno) -+ if len(node.escapes) or \ -+ ( -+ self.compiler.pagetag is not None and -+ len(self.compiler.pagetag.filter_args.args) -+ ) or \ -+ len(self.compiler.default_filters): -+ -+ s = self.create_filter_callable(node.escapes_code.args, -+ "%s" % node.text, True) -+ self.printer.writeline("__M_writer(%s)" % s) -+ else: -+ self.printer.writeline("__M_writer(%s)" % node.text) -+ -+ def visitControlLine(self, node): -+ if node.isend: -+ self.printer.writeline(None) -+ if node.has_loop_context: -+ self.printer.writeline('finally:') -+ self.printer.writeline("loop = __M_loop._exit()") -+ self.printer.writeline(None) -+ else: -+ self.printer.start_source(node.lineno) -+ if self.compiler.enable_loop and node.keyword == 'for': -+ text = mangle_mako_loop(node, self.printer) -+ else: -+ text = node.text -+ self.printer.writeline(text) -+ children = node.get_children() -+ # this covers the three situations where we want to insert a pass: -+ # 1) a ternary control line with no children, -+ # 2) a primary control line with nothing but its own ternary -+ # and end control lines, and -+ # 3) any control line with no content other than comments -+ if not children or ( -+ compat.all(isinstance(c, (parsetree.Comment, -+ parsetree.ControlLine)) -+ for c in children) and -+ compat.all((node.is_ternary(c.keyword) or c.isend) -+ for c in children -+ if isinstance(c, parsetree.ControlLine))): -+ self.printer.writeline("pass") -+ -+ def visitText(self, node): -+ self.printer.start_source(node.lineno) -+ self.printer.writeline("__M_writer(%s)" % repr(node.content)) -+ -+ def visitTextTag(self, node): -+ filtered = len(node.filter_args.args) > 0 -+ if filtered: -+ self.printer.writelines( -+ "__M_writer = context._push_writer()", -+ "try:", -+ ) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ if filtered: -+ self.printer.writelines( -+ "finally:", -+ "__M_buf, __M_writer = context._pop_buffer_and_writer()", -+ "__M_writer(%s)" % -+ self.create_filter_callable( -+ node.filter_args.args, -+ "__M_buf.getvalue()", -+ False), -+ None -+ ) -+ -+ def visitCode(self, node): -+ if not node.ismodule: -+ self.printer.start_source(node.lineno) -+ 
self.printer.write_indented_block(node.text) -+ -+ if not self.in_def and len(self.identifiers.locally_assigned) > 0: -+ # if we are the "template" def, fudge locally -+ # declared/modified variables into the "__M_locals" dictionary, -+ # which is used for def calls within the same template, -+ # to simulate "enclosing scope" -+ self.printer.writeline( -+ '__M_locals_builtin_stored = __M_locals_builtin()') -+ self.printer.writeline( -+ '__M_locals.update(__M_dict_builtin([(__M_key,' -+ ' __M_locals_builtin_stored[__M_key]) for __M_key in' -+ ' [%s] if __M_key in __M_locals_builtin_stored]))' % -+ ','.join([repr(x) for x in node.declared_identifiers()])) -+ -+ def visitIncludeTag(self, node): -+ self.printer.start_source(node.lineno) -+ args = node.attributes.get('args') -+ if args: -+ self.printer.writeline( -+ "runtime._include_file(context, %s, _template_uri, %s)" % -+ (node.parsed_attributes['file'], args)) -+ else: -+ self.printer.writeline( -+ "runtime._include_file(context, %s, _template_uri)" % -+ (node.parsed_attributes['file'])) -+ -+ def visitNamespaceTag(self, node): -+ pass -+ -+ def visitDefTag(self, node): -+ pass -+ -+ def visitBlockTag(self, node): -+ if node.is_anonymous: -+ self.printer.writeline("%s()" % node.funcname) -+ else: -+ nameargs = node.get_argument_expressions(as_call=True) -+ nameargs += ['**pageargs'] -+ self.printer.writeline("if 'parent' not in context._data or " -+ "not hasattr(context._data['parent'], '%s'):" -+ % node.funcname) -+ self.printer.writeline( -+ "context['self'].%s(%s)" % (node.funcname, ",".join(nameargs))) -+ self.printer.writeline("\n") -+ -+ def visitCallNamespaceTag(self, node): -+ # TODO: we can put namespace-specific checks here, such -+ # as ensure the given namespace will be imported, -+ # pre-import the namespace, etc. 
-+ self.visitCallTag(node) -+ -+ def visitCallTag(self, node): -+ self.printer.writeline("def ccall(caller):") -+ export = ['body'] -+ callable_identifiers = self.identifiers.branch(node, nested=True) -+ body_identifiers = callable_identifiers.branch(node, nested=False) -+ # we want the 'caller' passed to ccall to be used -+ # for the body() function, but for other non-body() -+ # <%def>s within <%call> we want the current caller -+ # off the call stack (if any) -+ body_identifiers.add_declared('caller') -+ -+ self.identifier_stack.append(body_identifiers) -+ class DefVisitor(object): -+ def visitDefTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitBlockTag(s, node): -+ s.visitDefOrBase(node) -+ -+ def visitDefOrBase(s, node): -+ self.write_inline_def(node, callable_identifiers, nested=False) -+ if not node.is_anonymous: -+ export.append(node.funcname) -+ # remove defs that are within the <%call> from the -+ # "closuredefs" defined in the body, so they dont render twice -+ if node.funcname in body_identifiers.closuredefs: -+ del body_identifiers.closuredefs[node.funcname] -+ -+ vis = DefVisitor() -+ for n in node.nodes: -+ n.accept_visitor(vis) -+ self.identifier_stack.pop() -+ -+ bodyargs = node.body_decl.get_argument_expressions() -+ self.printer.writeline("def body(%s):" % ','.join(bodyargs)) -+ -+ # TODO: figure out best way to specify -+ # buffering/nonbuffering (at call time would be better) -+ buffered = False -+ if buffered: -+ self.printer.writelines( -+ "context._push_buffer()", -+ "try:" -+ ) -+ self.write_variable_declares(body_identifiers) -+ self.identifier_stack.append(body_identifiers) -+ -+ for n in node.nodes: -+ n.accept_visitor(self) -+ self.identifier_stack.pop() -+ -+ self.write_def_finish(node, buffered, False, False, callstack=False) -+ self.printer.writelines( -+ None, -+ "return [%s]" % (','.join(export)), -+ None -+ ) -+ -+ self.printer.writelines( -+ # push on caller for nested call -+ "context.caller_stack.nextcaller = " -+ "runtime.Namespace('caller', context, " -+ "callables=ccall(__M_caller))", -+ "try:") -+ self.printer.start_source(node.lineno) -+ self.printer.writelines( -+ "__M_writer(%s)" % self.create_filter_callable( -+ [], node.expression, True), -+ "finally:", -+ "context.caller_stack.nextcaller = None", -+ None -+ ) -+ -+class _Identifiers(object): -+ """tracks the status of identifier names as template code is rendered.""" -+ -+ def __init__(self, compiler, node=None, parent=None, nested=False): -+ if parent is not None: -+ # if we are the branch created in write_namespaces(), -+ # we don't share any context from the main body(). -+ if isinstance(node, parsetree.NamespaceTag): -+ self.declared = set() -+ self.topleveldefs = util.SetLikeDict() -+ else: -+ # things that have already been declared -+ # in an enclosing namespace (i.e. names we can just use) -+ self.declared = set(parent.declared).\ -+ union([c.name for c in parent.closuredefs.values()]).\ -+ union(parent.locally_declared).\ -+ union(parent.argument_declared) -+ -+ # if these identifiers correspond to a "nested" -+ # scope, it means whatever the parent identifiers -+ # had as undeclared will have been declared by that parent, -+ # and therefore we have them in our scope. 
-+ if nested: -+ self.declared = self.declared.union(parent.undeclared) -+ -+ # top level defs that are available -+ self.topleveldefs = util.SetLikeDict(**parent.topleveldefs) -+ else: -+ self.declared = set() -+ self.topleveldefs = util.SetLikeDict() -+ -+ self.compiler = compiler -+ -+ # things within this level that are referenced before they -+ # are declared (e.g. assigned to) -+ self.undeclared = set() -+ -+ # things that are declared locally. some of these things -+ # could be in the "undeclared" list as well if they are -+ # referenced before declared -+ self.locally_declared = set() -+ -+ # assignments made in explicit python blocks. -+ # these will be propagated to -+ # the context of local def calls. -+ self.locally_assigned = set() -+ -+ # things that are declared in the argument -+ # signature of the def callable -+ self.argument_declared = set() -+ -+ # closure defs that are defined in this level -+ self.closuredefs = util.SetLikeDict() -+ -+ self.node = node -+ -+ if node is not None: -+ node.accept_visitor(self) -+ -+ illegal_names = self.compiler.reserved_names.intersection( -+ self.locally_declared) -+ if illegal_names: -+ raise exceptions.NameConflictError( -+ "Reserved words declared in template: %s" % -+ ", ".join(illegal_names)) -+ -+ -+ def branch(self, node, **kwargs): -+ """create a new Identifiers for a new Node, with -+ this Identifiers as the parent.""" -+ -+ return _Identifiers(self.compiler, node, self, **kwargs) -+ -+ @property -+ def defs(self): -+ return set(self.topleveldefs.union(self.closuredefs).values()) -+ -+ def __repr__(self): -+ return "Identifiers(declared=%r, locally_declared=%r, "\ -+ "undeclared=%r, topleveldefs=%r, closuredefs=%r, "\ -+ "argumentdeclared=%r)" %\ -+ ( -+ list(self.declared), -+ list(self.locally_declared), -+ list(self.undeclared), -+ [c.name for c in self.topleveldefs.values()], -+ [c.name for c in self.closuredefs.values()], -+ self.argument_declared) -+ -+ def check_declared(self, node): -+ """update the state of this Identifiers with the undeclared -+ and declared identifiers of the given node.""" -+ -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and\ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ for ident in node.declared_identifiers(): -+ self.locally_declared.add(ident) -+ -+ def add_declared(self, ident): -+ self.declared.add(ident) -+ if ident in self.undeclared: -+ self.undeclared.remove(ident) -+ -+ def visitExpression(self, node): -+ self.check_declared(node) -+ -+ def visitControlLine(self, node): -+ self.check_declared(node) -+ -+ def visitCode(self, node): -+ if not node.ismodule: -+ self.check_declared(node) -+ self.locally_assigned = self.locally_assigned.union( -+ node.declared_identifiers()) -+ -+ def visitNamespaceTag(self, node): -+ # only traverse into the sub-elements of a -+ # <%namespace> tag if we are the branch created in -+ # write_namespaces() -+ if self.node is node: -+ for n in node.nodes: -+ n.accept_visitor(self) -+ -+ def _check_name_exists(self, collection, node): -+ existing = collection.get(node.funcname) -+ collection[node.funcname] = node -+ if existing is not None and \ -+ existing is not node and \ -+ (node.is_block or existing.is_block): -+ raise exceptions.CompileException( -+ "%%def or %%block named '%s' already " -+ "exists in this template." 
% -+ node.funcname, **node.exception_kwargs) -+ -+ def visitDefTag(self, node): -+ if node.is_root() and not node.is_anonymous: -+ self._check_name_exists(self.topleveldefs, node) -+ elif node is not self.node: -+ self._check_name_exists(self.closuredefs, node) -+ -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ -+ # visit defs only one level deep -+ if node is self.node: -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ -+ for n in node.nodes: -+ n.accept_visitor(self) -+ -+ def visitBlockTag(self, node): -+ if node is not self.node and not node.is_anonymous: -+ -+ if isinstance(self.node, parsetree.DefTag): -+ raise exceptions.CompileException( -+ "Named block '%s' not allowed inside of def '%s'" -+ % (node.name, self.node.name), **node.exception_kwargs) -+ elif isinstance(self.node, -+ (parsetree.CallTag, parsetree.CallNamespaceTag)): -+ raise exceptions.CompileException( -+ "Named block '%s' not allowed inside of <%%call> tag" -+ % (node.name, ), **node.exception_kwargs) -+ -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ -+ if not node.is_anonymous: -+ self._check_name_exists(self.topleveldefs, node) -+ self.undeclared.add(node.funcname) -+ elif node is not self.node: -+ self._check_name_exists(self.closuredefs, node) -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ -+ def visitTextTag(self, node): -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union(self.locally_declared): -+ self.undeclared.add(ident) -+ -+ def visitIncludeTag(self, node): -+ self.check_declared(node) -+ -+ def visitPageTag(self, node): -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ self.check_declared(node) -+ -+ def visitCallNamespaceTag(self, node): -+ self.visitCallTag(node) -+ -+ def visitCallTag(self, node): -+ if node is self.node: -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union( -+ self.locally_declared): -+ self.undeclared.add(ident) -+ for ident in node.declared_identifiers(): -+ self.argument_declared.add(ident) -+ for n in node.nodes: -+ n.accept_visitor(self) -+ else: -+ for ident in node.undeclared_identifiers(): -+ if ident != 'context' and \ -+ ident not in self.declared.union( -+ self.locally_declared): -+ self.undeclared.add(ident) -+ -+ -+_FOR_LOOP = re.compile( -+ r'^for\s+((?:\(?)\s*[A-Za-z_][A-Za-z_0-9]*' -+ r'(?:\s*,\s*(?:[A-Za-z_][A-Za-z0-9_]*),??)*\s*(?:\)?))\s+in\s+(.*):' -+) -+ -+def mangle_mako_loop(node, printer): -+ """converts a for loop into a context manager wrapped around a for loop -+ when access to the `loop` variable has been detected in the for loop body -+ """ -+ loop_variable = LoopVariable() -+ node.accept_visitor(loop_variable) -+ if loop_variable.detected: -+ node.nodes[-1].has_loop_context = True -+ match = _FOR_LOOP.match(node.text) -+ if match: -+ printer.writelines( -+ 'loop = __M_loop._enter(%s)' % match.group(2), -+ 'try:' -+ #'with __M_loop(%s) as loop:' % match.group(2) -+ ) -+ text = 'for %s in loop:' % match.group(1) -+ else: -+ raise SyntaxError("Couldn't apply loop context: %s" % node.text) -+ else: -+ text = node.text -+ return text -+ -+ 
-+class LoopVariable(object): -+ """A node visitor which looks for the name 'loop' within undeclared -+ identifiers.""" -+ -+ def __init__(self): -+ self.detected = False -+ -+ def _loop_reference_detected(self, node): -+ if 'loop' in node.undeclared_identifiers(): -+ self.detected = True -+ else: -+ for n in node.get_children(): -+ n.accept_visitor(self) -+ -+ def visitControlLine(self, node): -+ self._loop_reference_detected(node) -+ -+ def visitCode(self, node): -+ self._loop_reference_detected(node) -+ -+ def visitExpression(self, node): -+ self._loop_reference_detected(node) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py -new file mode 100644 -index 0000000..fe277bb ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py -@@ -0,0 +1,174 @@ -+import sys -+import time -+ -+py3k = sys.version_info >= (3, 0) -+py33 = sys.version_info >= (3, 3) -+py2k = sys.version_info < (3,) -+py26 = sys.version_info >= (2, 6) -+jython = sys.platform.startswith('java') -+win32 = sys.platform.startswith('win') -+pypy = hasattr(sys, 'pypy_version_info') -+ -+if py3k: -+ from io import StringIO -+ import builtins as compat_builtins -+ from urllib.parse import quote_plus, unquote_plus -+ from html.entities import codepoint2name, name2codepoint -+ string_types = str, -+ binary_type = bytes -+ text_type = str -+ -+ from io import BytesIO as byte_buffer -+ -+ def u(s): -+ return s -+ -+ def b(s): -+ return s.encode("latin-1") -+ -+ def octal(lit): -+ return eval("0o" + lit) -+ -+else: -+ import __builtin__ as compat_builtins -+ try: -+ from cStringIO import StringIO -+ except: -+ from StringIO import StringIO -+ -+ byte_buffer = StringIO -+ -+ from urllib import quote_plus, unquote_plus -+ from htmlentitydefs import codepoint2name, name2codepoint -+ string_types = basestring, -+ binary_type = str -+ text_type = unicode -+ -+ def u(s): -+ return unicode(s, "utf-8") -+ -+ def b(s): -+ return s -+ -+ def octal(lit): -+ return eval("0" + lit) -+ -+ -+if py33: -+ from importlib import machinery -+ def load_module(module_id, path): -+ return machinery.SourceFileLoader(module_id, path).load_module() -+else: -+ import imp -+ def load_module(module_id, path): -+ fp = open(path, 'rb') -+ try: -+ return imp.load_source(module_id, path, fp) -+ finally: -+ fp.close() -+ -+ -+if py3k: -+ def reraise(tp, value, tb=None, cause=None): -+ if cause is not None: -+ value.__cause__ = cause -+ if value.__traceback__ is not tb: -+ raise value.with_traceback(tb) -+ raise value -+else: -+ exec("def reraise(tp, value, tb=None, cause=None):\n" -+ " raise tp, value, tb\n") -+ -+ -+def exception_as(): -+ return sys.exc_info()[1] -+ -+try: -+ import threading -+ if py3k: -+ import _thread as thread -+ else: -+ import thread -+except ImportError: -+ import dummy_threading as threading -+ if py3k: -+ import _dummy_thread as thread -+ else: -+ import dummy_thread as thread -+ -+if win32 or jython: -+ time_func = time.clock -+else: -+ time_func = time.time -+ -+try: -+ from functools import partial -+except: -+ def partial(func, *args, **keywords): -+ def newfunc(*fargs, **fkeywords): -+ newkeywords = keywords.copy() -+ newkeywords.update(fkeywords) -+ return func(*(args + fargs), **newkeywords) -+ return newfunc -+ -+ -+all = all -+import json -+ -+def exception_name(exc): -+ return exc.__class__.__name__ -+ -+try: -+ from inspect import CO_VARKEYWORDS, CO_VARARGS -+ def inspect_func_args(fn): -+ if py3k: -+ co = fn.__code__ -+ else: -+ 
co = fn.func_code -+ -+ nargs = co.co_argcount -+ names = co.co_varnames -+ args = list(names[:nargs]) -+ -+ varargs = None -+ if co.co_flags & CO_VARARGS: -+ varargs = co.co_varnames[nargs] -+ nargs = nargs + 1 -+ varkw = None -+ if co.co_flags & CO_VARKEYWORDS: -+ varkw = co.co_varnames[nargs] -+ -+ if py3k: -+ return args, varargs, varkw, fn.__defaults__ -+ else: -+ return args, varargs, varkw, fn.func_defaults -+except ImportError: -+ import inspect -+ def inspect_func_args(fn): -+ return inspect.getargspec(fn) -+ -+if py3k: -+ def callable(fn): -+ return hasattr(fn, '__call__') -+else: -+ callable = callable -+ -+ -+################################################ -+# cross-compatible metaclass implementation -+# Copyright (c) 2010-2012 Benjamin Peterson -+def with_metaclass(meta, base=object): -+ """Create a base class with a metaclass.""" -+ return meta("%sBase" % meta.__name__, (base,), {}) -+################################################ -+ -+ -+def arg_stringname(func_arg): -+ """Gets the string name of a kwarg or vararg -+ In Python3.4 a function's args are -+ of _ast.arg type not _ast.name -+ """ -+ if hasattr(func_arg, 'arg'): -+ return func_arg.arg -+ else: -+ return str(func_arg) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py -new file mode 100644 -index 0000000..c531f21 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py -@@ -0,0 +1,373 @@ -+# mako/exceptions.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""exception classes""" -+ -+import traceback -+import sys -+from mako import util, compat -+ -+class MakoException(Exception): -+ pass -+ -+class RuntimeException(MakoException): -+ pass -+ -+def _format_filepos(lineno, pos, filename): -+ if filename is None: -+ return " at line: %d char: %d" % (lineno, pos) -+ else: -+ return " in file '%s' at line: %d char: %d" % (filename, lineno, pos) -+ -+ -+class CompileException(MakoException): -+ def __init__(self, message, source, lineno, pos, filename): -+ MakoException.__init__(self, -+ message + _format_filepos(lineno, pos, filename)) -+ self.lineno = lineno -+ self.pos = pos -+ self.filename = filename -+ self.source = source -+ -+class SyntaxException(MakoException): -+ def __init__(self, message, source, lineno, pos, filename): -+ MakoException.__init__(self, -+ message + _format_filepos(lineno, pos, filename)) -+ self.lineno = lineno -+ self.pos = pos -+ self.filename = filename -+ self.source = source -+ -+class UnsupportedError(MakoException): -+ """raised when a retired feature is used.""" -+ -+class NameConflictError(MakoException): -+ """raised when a reserved word is used inappropriately""" -+ -+class TemplateLookupException(MakoException): -+ pass -+ -+class TopLevelLookupException(TemplateLookupException): -+ pass -+ -+class RichTraceback(object): -+ """Pull the current exception from the ``sys`` traceback and extracts -+ Mako-specific template information. -+ -+ See the usage examples in :ref:`handling_exceptions`. 
-+ -+ """ -+ def __init__(self, error=None, traceback=None): -+ self.source, self.lineno = "", 0 -+ -+ if error is None or traceback is None: -+ t, value, tback = sys.exc_info() -+ -+ if error is None: -+ error = value or t -+ -+ if traceback is None: -+ traceback = tback -+ -+ self.error = error -+ self.records = self._init(traceback) -+ -+ if isinstance(self.error, (CompileException, SyntaxException)): -+ self.source = self.error.source -+ self.lineno = self.error.lineno -+ self._has_source = True -+ -+ self._init_message() -+ -+ @property -+ def errorname(self): -+ return compat.exception_name(self.error) -+ -+ def _init_message(self): -+ """Find a unicode representation of self.error""" -+ try: -+ self.message = compat.text_type(self.error) -+ except UnicodeError: -+ try: -+ self.message = str(self.error) -+ except UnicodeEncodeError: -+ # Fallback to args as neither unicode nor -+ # str(Exception(u'\xe6')) work in Python < 2.6 -+ self.message = self.error.args[0] -+ if not isinstance(self.message, compat.text_type): -+ self.message = compat.text_type(self.message, 'ascii', 'replace') -+ -+ def _get_reformatted_records(self, records): -+ for rec in records: -+ if rec[6] is not None: -+ yield (rec[4], rec[5], rec[2], rec[6]) -+ else: -+ yield tuple(rec[0:4]) -+ -+ @property -+ def traceback(self): -+ """Return a list of 4-tuple traceback records (i.e. normal python -+ format) with template-corresponding lines remapped to the originating -+ template. -+ -+ """ -+ return list(self._get_reformatted_records(self.records)) -+ -+ @property -+ def reverse_records(self): -+ return reversed(self.records) -+ -+ @property -+ def reverse_traceback(self): -+ """Return the same data as traceback, except in reverse order. -+ """ -+ -+ return list(self._get_reformatted_records(self.reverse_records)) -+ -+ def _init(self, trcback): -+ """format a traceback from sys.exc_info() into 7-item tuples, -+ containing the regular four traceback tuple items, plus the original -+ template filename, the line number adjusted relative to the template -+ source, and code line from that line number of the template.""" -+ -+ import mako.template -+ mods = {} -+ rawrecords = traceback.extract_tb(trcback) -+ new_trcback = [] -+ for filename, lineno, function, line in rawrecords: -+ if not line: -+ line = '' -+ try: -+ (line_map, template_lines) = mods[filename] -+ except KeyError: -+ try: -+ info = mako.template._get_module_info(filename) -+ module_source = info.code -+ template_source = info.source -+ template_filename = info.template_filename or filename -+ except KeyError: -+ # A normal .py file (not a Template) -+ if not compat.py3k: -+ try: -+ fp = open(filename, 'rb') -+ encoding = util.parse_encoding(fp) -+ fp.close() -+ except IOError: -+ encoding = None -+ if encoding: -+ line = line.decode(encoding) -+ else: -+ line = line.decode('ascii', 'replace') -+ new_trcback.append((filename, lineno, function, line, -+ None, None, None, None)) -+ continue -+ -+ template_ln = 1 -+ -+ source_map = mako.template.ModuleInfo.\ -+ get_module_source_metadata( -+ module_source, full_line_map=True) -+ line_map = source_map['full_line_map'] -+ -+ template_lines = [line for line in -+ template_source.split("\n")] -+ mods[filename] = (line_map, template_lines) -+ -+ template_ln = line_map[lineno - 1] -+ -+ if template_ln <= len(template_lines): -+ template_line = template_lines[template_ln - 1] -+ else: -+ template_line = None -+ new_trcback.append((filename, lineno, function, -+ line, template_filename, template_ln, -+ template_line, 
template_source)) -+ if not self.source: -+ for l in range(len(new_trcback) - 1, 0, -1): -+ if new_trcback[l][5]: -+ self.source = new_trcback[l][7] -+ self.lineno = new_trcback[l][5] -+ break -+ else: -+ if new_trcback: -+ try: -+ # A normal .py file (not a Template) -+ fp = open(new_trcback[-1][0], 'rb') -+ encoding = util.parse_encoding(fp) -+ fp.seek(0) -+ self.source = fp.read() -+ fp.close() -+ if encoding: -+ self.source = self.source.decode(encoding) -+ except IOError: -+ self.source = '' -+ self.lineno = new_trcback[-1][1] -+ return new_trcback -+ -+ -+def text_error_template(lookup=None): -+ """Provides a template that renders a stack trace in a similar format to -+ the Python interpreter, substituting source template filenames, line -+ numbers and code for that of the originating source template, as -+ applicable. -+ -+ """ -+ import mako.template -+ return mako.template.Template(r""" -+<%page args="error=None, traceback=None"/> -+<%! -+ from mako.exceptions import RichTraceback -+%>\ -+<% -+ tback = RichTraceback(error=error, traceback=traceback) -+%>\ -+Traceback (most recent call last): -+% for (filename, lineno, function, line) in tback.traceback: -+ File "${filename}", line ${lineno}, in ${function or '?'} -+ ${line | trim} -+% endfor -+${tback.errorname}: ${tback.message} -+""") -+ -+ -+def _install_pygments(): -+ global syntax_highlight, pygments_html_formatter -+ from mako.ext.pygmentplugin import syntax_highlight,\ -+ pygments_html_formatter -+ -+def _install_fallback(): -+ global syntax_highlight, pygments_html_formatter -+ from mako.filters import html_escape -+ pygments_html_formatter = None -+ def syntax_highlight(filename='', language=None): -+ return html_escape -+ -+def _install_highlighting(): -+ try: -+ _install_pygments() -+ except ImportError: -+ _install_fallback() -+_install_highlighting() -+ -+def html_error_template(): -+ """Provides a template that renders a stack trace in an HTML format, -+ providing an excerpt of code as well as substituting source template -+ filenames, line numbers and code for that of the originating source -+ template, as applicable. -+ -+ The template's default ``encoding_errors`` value is -+ ``'htmlentityreplace'``. The template has two options. With the -+ ``full`` option disabled, only a section of an HTML document is -+ returned. With the ``css`` option disabled, the default stylesheet -+ won't be included. -+ -+ """ -+ import mako.template -+ return mako.template.Template(r""" -+<%! -+ from mako.exceptions import RichTraceback, syntax_highlight,\ -+ pygments_html_formatter -+%> -+<%page args="full=True, css=True, error=None, traceback=None"/> -+% if full: -+ -+ -+ Mako Runtime Error -+% endif -+% if css: -+ -+% endif -+% if full: -+ -+ -+% endif -+ -+

-+<h2>Error !</h2>
-+<%
-+    tback = RichTraceback(error=error, traceback=traceback)
-+    src = tback.source
-+    line = tback.lineno
-+    if src:
-+        lines = src.split('\n')
-+    else:
-+        lines = None
-+%>
-+<p>
-+${tback.errorname}: ${tback.message|h}
-+</p>
-+
-+% if lines:
-+    <div class="sample">
-+    <div class="nonhighlight">
-+% for index in range(max(0, line-4),min(len(lines), line+5)):
-+    <%
-+       if pygments_html_formatter:
-+           pygments_html_formatter.linenostart = index + 1
-+    %>
-+    % if index + 1 == line:
-+    <%
-+       if pygments_html_formatter:
-+           old_cssclass = pygments_html_formatter.cssclass
-+           pygments_html_formatter.cssclass = 'error ' + old_cssclass
-+    %>
-+        ${lines[index] | syntax_highlight(language='mako')}
-+    <%
-+       if pygments_html_formatter:
-+           pygments_html_formatter.cssclass = old_cssclass
-+    %>
-+    % else:
-+        ${lines[index] | syntax_highlight(language='mako')}
-+    % endif
-+% endfor
-+    </div>
-+    </div>
-+% endif
-+
-+<div class="stacktrace">
-+% for (filename, lineno, function, line) in tback.reverse_traceback:
-+    <div class="location">${filename}, line ${lineno}:</div>
-+    <div class="nonhighlight">
-+    <%
-+       if pygments_html_formatter:
-+           pygments_html_formatter.linenostart = lineno
-+    %>
-+      <div class="sourceline">${line | syntax_highlight(filename)}</div>
-+    </div>
-+% endfor
-+</div>
-+ -+% if full: -+ -+ -+% endif -+""", output_encoding=sys.getdefaultencoding(), -+ encoding_errors='htmlentityreplace') -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py -new file mode 100644 -index 0000000..d79ce23 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py -@@ -0,0 +1,201 @@ -+# mako/filters.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+ -+import re -+import codecs -+ -+from mako.compat import quote_plus, unquote_plus, codepoint2name, \ -+ name2codepoint -+ -+from mako import compat -+ -+xml_escapes = { -+ '&': '&', -+ '>': '>', -+ '<': '<', -+ '"': '"', # also " in html-only -+ "'": ''' # also ' in html-only -+} -+ -+# XXX: " is valid in HTML and XML -+# ' is not valid HTML, but is valid XML -+ -+def legacy_html_escape(s): -+ """legacy HTML escape for non-unicode mode.""" -+ s = s.replace("&", "&") -+ s = s.replace(">", ">") -+ s = s.replace("<", "<") -+ s = s.replace('"', """) -+ s = s.replace("'", "'") -+ return s -+ -+ -+try: -+ import markupsafe -+ html_escape = markupsafe.escape -+except ImportError: -+ html_escape = legacy_html_escape -+ -+def xml_escape(string): -+ return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string) -+ -+def url_escape(string): -+ # convert into a list of octets -+ string = string.encode("utf8") -+ return quote_plus(string) -+ -+def legacy_url_escape(string): -+ # convert into a list of octets -+ return quote_plus(string) -+ -+def url_unescape(string): -+ text = unquote_plus(string) -+ if not is_ascii_str(text): -+ text = text.decode("utf8") -+ return text -+ -+def trim(string): -+ return string.strip() -+ -+ -+class Decode(object): -+ def __getattr__(self, key): -+ def decode(x): -+ if isinstance(x, compat.text_type): -+ return x -+ elif not isinstance(x, compat.binary_type): -+ return decode(str(x)) -+ else: -+ return compat.text_type(x, encoding=key) -+ return decode -+decode = Decode() -+ -+ -+_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z') -+ -+def is_ascii_str(text): -+ return isinstance(text, str) and _ASCII_re.match(text) -+ -+################################################################ -+ -+class XMLEntityEscaper(object): -+ def __init__(self, codepoint2name, name2codepoint): -+ self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n)) -+ for c, n in codepoint2name.items()]) -+ self.name2codepoint = name2codepoint -+ -+ def escape_entities(self, text): -+ """Replace characters with their character entity references. -+ -+ Only characters corresponding to a named entity are replaced. -+ """ -+ return compat.text_type(text).translate(self.codepoint2entity) -+ -+ def __escape(self, m): -+ codepoint = ord(m.group()) -+ try: -+ return self.codepoint2entity[codepoint] -+ except (KeyError, IndexError): -+ return '&#x%X;' % codepoint -+ -+ -+ __escapable = re.compile(r'["&<>]|[^\x00-\x7f]') -+ -+ def escape(self, text): -+ """Replace characters with their character references. -+ -+ Replace characters by their named entity references. -+ Non-ASCII characters, if they do not have a named entity reference, -+ are replaced by numerical character references. -+ -+ The return value is guaranteed to be ASCII. 
-+ """ -+ return self.__escapable.sub(self.__escape, compat.text_type(text) -+ ).encode('ascii') -+ -+ # XXX: This regexp will not match all valid XML entity names__. -+ # (It punts on details involving involving CombiningChars and Extenders.) -+ # -+ # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef -+ __characterrefs = re.compile(r'''& (?: -+ \#(\d+) -+ | \#x([\da-f]+) -+ | ( (?!\d) [:\w] [-.:\w]+ ) -+ ) ;''', -+ re.X | re.UNICODE) -+ -+ def __unescape(self, m): -+ dval, hval, name = m.groups() -+ if dval: -+ codepoint = int(dval) -+ elif hval: -+ codepoint = int(hval, 16) -+ else: -+ codepoint = self.name2codepoint.get(name, 0xfffd) -+ # U+FFFD = "REPLACEMENT CHARACTER" -+ if codepoint < 128: -+ return chr(codepoint) -+ return chr(codepoint) -+ -+ def unescape(self, text): -+ """Unescape character references. -+ -+ All character references (both entity references and numerical -+ character references) are unescaped. -+ """ -+ return self.__characterrefs.sub(self.__unescape, text) -+ -+ -+_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint) -+ -+html_entities_escape = _html_entities_escaper.escape_entities -+html_entities_unescape = _html_entities_escaper.unescape -+ -+ -+def htmlentityreplace_errors(ex): -+ """An encoding error handler. -+ -+ This python `codecs`_ error handler replaces unencodable -+ characters with HTML entities, or, if no HTML entity exists for -+ the character, XML character references. -+ -+ >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace') -+ 'The cost was €12.' -+ """ -+ if isinstance(ex, UnicodeEncodeError): -+ # Handle encoding errors -+ bad_text = ex.object[ex.start:ex.end] -+ text = _html_entities_escaper.escape(bad_text) -+ return (compat.text_type(text), ex.end) -+ raise ex -+ -+codecs.register_error('htmlentityreplace', htmlentityreplace_errors) -+ -+ -+# TODO: options to make this dynamic per-compilation will be added in a later -+# release -+DEFAULT_ESCAPES = { -+ 'x': 'filters.xml_escape', -+ 'h': 'filters.html_escape', -+ 'u': 'filters.url_escape', -+ 'trim': 'filters.trim', -+ 'entity': 'filters.html_entities_escape', -+ 'unicode': 'unicode', -+ 'decode': 'decode', -+ 'str': 'str', -+ 'n': 'n' -+} -+ -+if compat.py3k: -+ DEFAULT_ESCAPES.update({ -+ 'unicode': 'str' -+ }) -+ -+NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy() -+NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape' -+NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape' -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py -new file mode 100644 -index 0000000..1dda398 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py -@@ -0,0 +1,441 @@ -+# mako/lexer.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""provides the Lexer class for parsing template strings into parse trees.""" -+ -+import re -+import codecs -+from mako import parsetree, exceptions, compat -+from mako.pygen import adjust_whitespace -+ -+_regexp_cache = {} -+ -+class Lexer(object): -+ def __init__(self, text, filename=None, -+ disable_unicode=False, -+ input_encoding=None, preprocessor=None): -+ self.text = text -+ self.filename = filename -+ self.template = parsetree.TemplateNode(self.filename) -+ self.matched_lineno = 1 -+ self.matched_charpos = 0 -+ self.lineno = 1 -+ self.match_position = 0 -+ self.tag = [] -+ 
self.control_line = [] -+ self.ternary_stack = [] -+ self.disable_unicode = disable_unicode -+ self.encoding = input_encoding -+ -+ if compat.py3k and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "Mako for Python 3 does not " -+ "support disabling Unicode") -+ -+ if preprocessor is None: -+ self.preprocessor = [] -+ elif not hasattr(preprocessor, '__iter__'): -+ self.preprocessor = [preprocessor] -+ else: -+ self.preprocessor = preprocessor -+ -+ @property -+ def exception_kwargs(self): -+ return {'source': self.text, -+ 'lineno': self.matched_lineno, -+ 'pos': self.matched_charpos, -+ 'filename': self.filename} -+ -+ def match(self, regexp, flags=None): -+ """compile the given regexp, cache the reg, and call match_reg().""" -+ -+ try: -+ reg = _regexp_cache[(regexp, flags)] -+ except KeyError: -+ if flags: -+ reg = re.compile(regexp, flags) -+ else: -+ reg = re.compile(regexp) -+ _regexp_cache[(regexp, flags)] = reg -+ -+ return self.match_reg(reg) -+ -+ def match_reg(self, reg): -+ """match the given regular expression object to the current text -+ position. -+ -+ if a match occurs, update the current text and line position. -+ -+ """ -+ -+ mp = self.match_position -+ -+ match = reg.match(self.text, self.match_position) -+ if match: -+ (start, end) = match.span() -+ if end == start: -+ self.match_position = end + 1 -+ else: -+ self.match_position = end -+ self.matched_lineno = self.lineno -+ lines = re.findall(r"\n", self.text[mp:self.match_position]) -+ cp = mp - 1 -+ while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'): -+ cp -= 1 -+ self.matched_charpos = mp - cp -+ self.lineno += len(lines) -+ #print "MATCHED:", match.group(0), "LINE START:", -+ # self.matched_lineno, "LINE END:", self.lineno -+ #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \ -+ # (match and "TRUE" or "FALSE") -+ return match -+ -+ def parse_until_text(self, *text): -+ startpos = self.match_position -+ text_re = r'|'.join(text) -+ brace_level = 0 -+ while True: -+ match = self.match(r'#.*\n') -+ if match: -+ continue -+ match = self.match(r'(\"\"\"|\'\'\'|\"|\')((? 
0: -+ brace_level -= 1 -+ continue -+ return \ -+ self.text[startpos: -+ self.match_position - len(match.group(1))],\ -+ match.group(1) -+ match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S) -+ if match: -+ brace_level += match.group(1).count('{') -+ brace_level -= match.group(1).count('}') -+ continue -+ raise exceptions.SyntaxException( -+ "Expected: %s" % -+ ','.join(text), -+ **self.exception_kwargs) -+ -+ def append_node(self, nodecls, *args, **kwargs): -+ kwargs.setdefault('source', self.text) -+ kwargs.setdefault('lineno', self.matched_lineno) -+ kwargs.setdefault('pos', self.matched_charpos) -+ kwargs['filename'] = self.filename -+ node = nodecls(*args, **kwargs) -+ if len(self.tag): -+ self.tag[-1].nodes.append(node) -+ else: -+ self.template.nodes.append(node) -+ # build a set of child nodes for the control line -+ # (used for loop variable detection) -+ # also build a set of child nodes on ternary control lines -+ # (used for determining if a pass needs to be auto-inserted -+ if self.control_line: -+ control_frame = self.control_line[-1] -+ control_frame.nodes.append(node) -+ if not (isinstance(node, parsetree.ControlLine) and -+ control_frame.is_ternary(node.keyword)): -+ if self.ternary_stack and self.ternary_stack[-1]: -+ self.ternary_stack[-1][-1].nodes.append(node) -+ if isinstance(node, parsetree.Tag): -+ if len(self.tag): -+ node.parent = self.tag[-1] -+ self.tag.append(node) -+ elif isinstance(node, parsetree.ControlLine): -+ if node.isend: -+ self.control_line.pop() -+ self.ternary_stack.pop() -+ elif node.is_primary: -+ self.control_line.append(node) -+ self.ternary_stack.append([]) -+ elif self.control_line and \ -+ self.control_line[-1].is_ternary(node.keyword): -+ self.ternary_stack[-1].append(node) -+ elif self.control_line and \ -+ not self.control_line[-1].is_ternary(node.keyword): -+ raise exceptions.SyntaxException( -+ "Keyword '%s' not a legal ternary for keyword '%s'" % -+ (node.keyword, self.control_line[-1].keyword), -+ **self.exception_kwargs) -+ -+ _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n') -+ -+ def decode_raw_stream(self, text, decode_raw, known_encoding, filename): -+ """given string/unicode or bytes/string, determine encoding -+ from magic encoding comment, return body as unicode -+ or raw if decode_raw=False -+ -+ """ -+ if isinstance(text, compat.text_type): -+ m = self._coding_re.match(text) -+ encoding = m and m.group(1) or known_encoding or 'ascii' -+ return encoding, text -+ -+ if text.startswith(codecs.BOM_UTF8): -+ text = text[len(codecs.BOM_UTF8):] -+ parsed_encoding = 'utf-8' -+ m = self._coding_re.match(text.decode('utf-8', 'ignore')) -+ if m is not None and m.group(1) != 'utf-8': -+ raise exceptions.CompileException( -+ "Found utf-8 BOM in file, with conflicting " -+ "magic encoding comment of '%s'" % m.group(1), -+ text.decode('utf-8', 'ignore'), -+ 0, 0, filename) -+ else: -+ m = self._coding_re.match(text.decode('utf-8', 'ignore')) -+ if m: -+ parsed_encoding = m.group(1) -+ else: -+ parsed_encoding = known_encoding or 'ascii' -+ -+ if decode_raw: -+ try: -+ text = text.decode(parsed_encoding) -+ except UnicodeDecodeError: -+ raise exceptions.CompileException( -+ "Unicode decode operation of encoding '%s' failed" % -+ parsed_encoding, -+ text.decode('utf-8', 'ignore'), -+ 0, 0, filename) -+ -+ return parsed_encoding, text -+ -+ def parse(self): -+ self.encoding, self.text = self.decode_raw_stream(self.text, -+ not self.disable_unicode, -+ self.encoding, -+ self.filename,) -+ -+ for preproc in self.preprocessor: 
-+ self.text = preproc(self.text) -+ -+ # push the match marker past the -+ # encoding comment. -+ self.match_reg(self._coding_re) -+ -+ self.textlength = len(self.text) -+ -+ while (True): -+ if self.match_position > self.textlength: -+ break -+ -+ if self.match_end(): -+ break -+ if self.match_expression(): -+ continue -+ if self.match_control_line(): -+ continue -+ if self.match_comment(): -+ continue -+ if self.match_tag_start(): -+ continue -+ if self.match_tag_end(): -+ continue -+ if self.match_python_block(): -+ continue -+ if self.match_text(): -+ continue -+ -+ if self.match_position > self.textlength: -+ break -+ raise exceptions.CompileException("assertion failed") -+ -+ if len(self.tag): -+ raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % -+ self.tag[-1].keyword, -+ **self.exception_kwargs) -+ if len(self.control_line): -+ raise exceptions.SyntaxException( -+ "Unterminated control keyword: '%s'" % -+ self.control_line[-1].keyword, -+ self.text, -+ self.control_line[-1].lineno, -+ self.control_line[-1].pos, self.filename) -+ return self.template -+ -+ def match_tag_start(self): -+ match = self.match(r''' -+ \<% # opening tag -+ -+ ([\w\.\:]+) # keyword -+ -+ ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \ -+ # sign, string expression -+ -+ \s* # more whitespace -+ -+ (/)?> # closing -+ -+ ''', -+ -+ re.I | re.S | re.X) -+ -+ if match: -+ keyword, attr, isend = match.groups() -+ self.keyword = keyword -+ attributes = {} -+ if attr: -+ for att in re.findall( -+ r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr): -+ key, val1, val2 = att -+ text = val1 or val2 -+ text = text.replace('\r\n', '\n') -+ attributes[key] = text -+ self.append_node(parsetree.Tag, keyword, attributes) -+ if isend: -+ self.tag.pop() -+ else: -+ if keyword == 'text': -+ match = self.match(r'(.*?)(?=\)', re.S) -+ if not match: -+ raise exceptions.SyntaxException( -+ "Unclosed tag: <%%%s>" % -+ self.tag[-1].keyword, -+ **self.exception_kwargs) -+ self.append_node(parsetree.Text, match.group(1)) -+ return self.match_tag_end() -+ return True -+ else: -+ return False -+ -+ def match_tag_end(self): -+ match = self.match(r'\') -+ if match: -+ if not len(self.tag): -+ raise exceptions.SyntaxException( -+ "Closing tag without opening tag: " % -+ match.group(1), -+ **self.exception_kwargs) -+ elif self.tag[-1].keyword != match.group(1): -+ raise exceptions.SyntaxException( -+ "Closing tag does not match tag: <%%%s>" % -+ (match.group(1), self.tag[-1].keyword), -+ **self.exception_kwargs) -+ self.tag.pop() -+ return True -+ else: -+ return False -+ -+ def match_end(self): -+ match = self.match(r'\Z', re.S) -+ if match: -+ string = match.group() -+ if string: -+ return string -+ else: -+ return True -+ else: -+ return False -+ -+ def match_text(self): -+ match = self.match(r""" -+ (.*?) 
# anything, followed by: -+ ( -+ (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based -+ # comment preceded by a -+ # consumed newline and whitespace -+ | -+ (?=\${) # an expression -+ | -+ (?=') -+ # the trailing newline helps -+ # compiler.parse() not complain about indentation -+ text = adjust_whitespace(text) + "\n" -+ self.append_node( -+ parsetree.Code, -+ text, -+ match.group(1) == '!', lineno=line, pos=pos) -+ return True -+ else: -+ return False -+ -+ def match_expression(self): -+ match = self.match(r"\${") -+ if match: -+ line, pos = self.matched_lineno, self.matched_charpos -+ text, end = self.parse_until_text(r'\|', r'}') -+ if end == '|': -+ escapes, end = self.parse_until_text(r'}') -+ else: -+ escapes = "" -+ text = text.replace('\r\n', '\n') -+ self.append_node( -+ parsetree.Expression, -+ text, escapes.strip(), -+ lineno=line, pos=pos) -+ return True -+ else: -+ return False -+ -+ def match_control_line(self): -+ match = self.match( -+ r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)" -+ r"(?:\r?\n|\Z)", re.M) -+ if match: -+ operator = match.group(1) -+ text = match.group(2) -+ if operator == '%': -+ m2 = re.match(r'(end)?(\w+)\s*(.*)', text) -+ if not m2: -+ raise exceptions.SyntaxException( -+ "Invalid control line: '%s'" % -+ text, -+ **self.exception_kwargs) -+ isend, keyword = m2.group(1, 2) -+ isend = (isend is not None) -+ -+ if isend: -+ if not len(self.control_line): -+ raise exceptions.SyntaxException( -+ "No starting keyword '%s' for '%s'" % -+ (keyword, text), -+ **self.exception_kwargs) -+ elif self.control_line[-1].keyword != keyword: -+ raise exceptions.SyntaxException( -+ "Keyword '%s' doesn't match keyword '%s'" % -+ (text, self.control_line[-1].keyword), -+ **self.exception_kwargs) -+ self.append_node(parsetree.ControlLine, keyword, isend, text) -+ else: -+ self.append_node(parsetree.Comment, text) -+ return True -+ else: -+ return False -+ -+ def match_comment(self): -+ """matches the multiline version of a comment""" -+ match = self.match(r"<%doc>(.*?)", re.S) -+ if match: -+ self.append_node(parsetree.Comment, match.group(1)) -+ return True -+ else: -+ return False -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py -new file mode 100644 -index 0000000..2af5411 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py -@@ -0,0 +1,359 @@ -+# mako/lookup.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+import os, stat, posixpath, re -+from mako import exceptions, util -+from mako.template import Template -+ -+try: -+ import threading -+except: -+ import dummy_threading as threading -+ -+class TemplateCollection(object): -+ """Represent a collection of :class:`.Template` objects, -+ identifiable via URI. -+ -+ A :class:`.TemplateCollection` is linked to the usage of -+ all template tags that address other templates, such -+ as ``<%include>``, ``<%namespace>``, and ``<%inherit>``. -+ The ``file`` attribute of each of those tags refers -+ to a string URI that is passed to that :class:`.Template` -+ object's :class:`.TemplateCollection` for resolution. -+ -+ :class:`.TemplateCollection` is an abstract class, -+ with the usual default implementation being :class:`.TemplateLookup`. 
-+ -+ """ -+ -+ def has_template(self, uri): -+ """Return ``True`` if this :class:`.TemplateLookup` is -+ capable of returning a :class:`.Template` object for the -+ given ``uri``. -+ -+ :param uri: String URI of the template to be resolved. -+ -+ """ -+ try: -+ self.get_template(uri) -+ return True -+ except exceptions.TemplateLookupException: -+ return False -+ -+ def get_template(self, uri, relativeto=None): -+ """Return a :class:`.Template` object corresponding to the given -+ ``uri``. -+ -+ The default implementation raises -+ :class:`.NotImplementedError`. Implementations should -+ raise :class:`.TemplateLookupException` if the given ``uri`` -+ cannot be resolved. -+ -+ :param uri: String URI of the template to be resolved. -+ :param relativeto: if present, the given ``uri`` is assumed to -+ be relative to this URI. -+ -+ """ -+ raise NotImplementedError() -+ -+ def filename_to_uri(self, uri, filename): -+ """Convert the given ``filename`` to a URI relative to -+ this :class:`.TemplateCollection`.""" -+ -+ return uri -+ -+ def adjust_uri(self, uri, filename): -+ """Adjust the given ``uri`` based on the calling ``filename``. -+ -+ When this method is called from the runtime, the -+ ``filename`` parameter is taken directly to the ``filename`` -+ attribute of the calling template. Therefore a custom -+ :class:`.TemplateCollection` subclass can place any string -+ identifier desired in the ``filename`` parameter of the -+ :class:`.Template` objects it constructs and have them come back -+ here. -+ -+ """ -+ return uri -+ -+class TemplateLookup(TemplateCollection): -+ """Represent a collection of templates that locates template source files -+ from the local filesystem. -+ -+ The primary argument is the ``directories`` argument, the list of -+ directories to search: -+ -+ .. sourcecode:: python -+ -+ lookup = TemplateLookup(["/path/to/templates"]) -+ some_template = lookup.get_template("/index.html") -+ -+ The :class:`.TemplateLookup` can also be given :class:`.Template` objects -+ programatically using :meth:`.put_string` or :meth:`.put_template`: -+ -+ .. sourcecode:: python -+ -+ lookup = TemplateLookup() -+ lookup.put_string("base.html", ''' -+ ${self.next()} -+ ''') -+ lookup.put_string("hello.html", ''' -+ <%include file='base.html'/> -+ -+ Hello, world ! -+ ''') -+ -+ -+ :param directories: A list of directory names which will be -+ searched for a particular template URI. The URI is appended -+ to each directory and the filesystem checked. -+ -+ :param collection_size: Approximate size of the collection used -+ to store templates. If left at its default of ``-1``, the size -+ is unbounded, and a plain Python dictionary is used to -+ relate URI strings to :class:`.Template` instances. -+ Otherwise, a least-recently-used cache object is used which -+ will maintain the size of the collection approximately to -+ the number given. -+ -+ :param filesystem_checks: When at its default value of ``True``, -+ each call to :meth:`.TemplateLookup.get_template()` will -+ compare the filesystem last modified time to the time in -+ which an existing :class:`.Template` object was created. -+ This allows the :class:`.TemplateLookup` to regenerate a -+ new :class:`.Template` whenever the original source has -+ been updated. Set this to ``False`` for a very minor -+ performance increase. -+ -+ :param modulename_callable: A callable which, when present, -+ is passed the path of the source file as well as the -+ requested URI, and then returns the full path of the -+ generated Python module file. 
This is used to inject -+ alternate schemes for Python module location. If left at -+ its default of ``None``, the built in system of generation -+ based on ``module_directory`` plus ``uri`` is used. -+ -+ All other keyword parameters available for -+ :class:`.Template` are mirrored here. When new -+ :class:`.Template` objects are created, the keywords -+ established with this :class:`.TemplateLookup` are passed on -+ to each new :class:`.Template`. -+ -+ """ -+ -+ def __init__(self, -+ directories=None, -+ module_directory=None, -+ filesystem_checks=True, -+ collection_size=-1, -+ format_exceptions=False, -+ error_handler=None, -+ disable_unicode=False, -+ bytestring_passthrough=False, -+ output_encoding=None, -+ encoding_errors='strict', -+ -+ cache_args=None, -+ cache_impl='beaker', -+ cache_enabled=True, -+ cache_type=None, -+ cache_dir=None, -+ cache_url=None, -+ -+ modulename_callable=None, -+ module_writer=None, -+ default_filters=None, -+ buffer_filters=(), -+ strict_undefined=False, -+ imports=None, -+ future_imports=None, -+ enable_loop=True, -+ input_encoding=None, -+ preprocessor=None, -+ lexer_cls=None): -+ -+ self.directories = [posixpath.normpath(d) for d in -+ util.to_list(directories, ()) -+ ] -+ self.module_directory = module_directory -+ self.modulename_callable = modulename_callable -+ self.filesystem_checks = filesystem_checks -+ self.collection_size = collection_size -+ -+ if cache_args is None: -+ cache_args = {} -+ # transfer deprecated cache_* args -+ if cache_dir: -+ cache_args.setdefault('dir', cache_dir) -+ if cache_url: -+ cache_args.setdefault('url', cache_url) -+ if cache_type: -+ cache_args.setdefault('type', cache_type) -+ -+ self.template_args = { -+ 'format_exceptions':format_exceptions, -+ 'error_handler':error_handler, -+ 'disable_unicode':disable_unicode, -+ 'bytestring_passthrough':bytestring_passthrough, -+ 'output_encoding':output_encoding, -+ 'cache_impl':cache_impl, -+ 'encoding_errors':encoding_errors, -+ 'input_encoding':input_encoding, -+ 'module_directory':module_directory, -+ 'module_writer':module_writer, -+ 'cache_args':cache_args, -+ 'cache_enabled':cache_enabled, -+ 'default_filters':default_filters, -+ 'buffer_filters':buffer_filters, -+ 'strict_undefined':strict_undefined, -+ 'imports':imports, -+ 'future_imports':future_imports, -+ 'enable_loop':enable_loop, -+ 'preprocessor':preprocessor, -+ 'lexer_cls':lexer_cls -+ } -+ -+ if collection_size == -1: -+ self._collection = {} -+ self._uri_cache = {} -+ else: -+ self._collection = util.LRUCache(collection_size) -+ self._uri_cache = util.LRUCache(collection_size) -+ self._mutex = threading.Lock() -+ -+ def get_template(self, uri): -+ """Return a :class:`.Template` object corresponding to the given -+ ``uri``. -+ -+ .. note:: The ``relativeto`` argument is not supported here at the moment. 
-+ -+ """ -+ -+ try: -+ if self.filesystem_checks: -+ return self._check(uri, self._collection[uri]) -+ else: -+ return self._collection[uri] -+ except KeyError: -+ u = re.sub(r'^\/+', '', uri) -+ for dir in self.directories: -+ srcfile = posixpath.normpath(posixpath.join(dir, u)) -+ if os.path.isfile(srcfile): -+ return self._load(srcfile, uri) -+ else: -+ raise exceptions.TopLevelLookupException( -+ "Cant locate template for uri %r" % uri) -+ -+ def adjust_uri(self, uri, relativeto): -+ """Adjust the given ``uri`` based on the given relative URI.""" -+ -+ key = (uri, relativeto) -+ if key in self._uri_cache: -+ return self._uri_cache[key] -+ -+ if uri[0] != '/': -+ if relativeto is not None: -+ v = self._uri_cache[key] = posixpath.join( -+ posixpath.dirname(relativeto), uri) -+ else: -+ v = self._uri_cache[key] = '/' + uri -+ else: -+ v = self._uri_cache[key] = uri -+ return v -+ -+ -+ def filename_to_uri(self, filename): -+ """Convert the given ``filename`` to a URI relative to -+ this :class:`.TemplateCollection`.""" -+ -+ try: -+ return self._uri_cache[filename] -+ except KeyError: -+ value = self._relativeize(filename) -+ self._uri_cache[filename] = value -+ return value -+ -+ def _relativeize(self, filename): -+ """Return the portion of a filename that is 'relative' -+ to the directories in this lookup. -+ -+ """ -+ -+ filename = posixpath.normpath(filename) -+ for dir in self.directories: -+ if filename[0:len(dir)] == dir: -+ return filename[len(dir):] -+ else: -+ return None -+ -+ def _load(self, filename, uri): -+ self._mutex.acquire() -+ try: -+ try: -+ # try returning from collection one -+ # more time in case concurrent thread already loaded -+ return self._collection[uri] -+ except KeyError: -+ pass -+ try: -+ if self.modulename_callable is not None: -+ module_filename = self.modulename_callable(filename, uri) -+ else: -+ module_filename = None -+ self._collection[uri] = template = Template( -+ uri=uri, -+ filename=posixpath.normpath(filename), -+ lookup=self, -+ module_filename=module_filename, -+ **self.template_args) -+ return template -+ except: -+ # if compilation fails etc, ensure -+ # template is removed from collection, -+ # re-raise -+ self._collection.pop(uri, None) -+ raise -+ finally: -+ self._mutex.release() -+ -+ def _check(self, uri, template): -+ if template.filename is None: -+ return template -+ -+ try: -+ template_stat = os.stat(template.filename) -+ if template.module._modified_time < \ -+ template_stat[stat.ST_MTIME]: -+ self._collection.pop(uri, None) -+ return self._load(template.filename, uri) -+ else: -+ return template -+ except OSError: -+ self._collection.pop(uri, None) -+ raise exceptions.TemplateLookupException( -+ "Cant locate template for uri %r" % uri) -+ -+ -+ def put_string(self, uri, text): -+ """Place a new :class:`.Template` object into this -+ :class:`.TemplateLookup`, based on the given string of -+ ``text``. -+ -+ """ -+ self._collection[uri] = Template( -+ text, -+ lookup=self, -+ uri=uri, -+ **self.template_args) -+ -+ def put_template(self, uri, template): -+ """Place a new :class:`.Template` object into this -+ :class:`.TemplateLookup`, based on the given -+ :class:`.Template` object. 
-+ -+ """ -+ self._collection[uri] = template -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py -new file mode 100644 -index 0000000..49ec4e0 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py -@@ -0,0 +1,594 @@ -+# mako/parsetree.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""defines the parse tree components for Mako templates.""" -+ -+from mako import exceptions, ast, util, filters, compat -+import re -+ -+class Node(object): -+ """base class for a Node in the parse tree.""" -+ -+ def __init__(self, source, lineno, pos, filename): -+ self.source = source -+ self.lineno = lineno -+ self.pos = pos -+ self.filename = filename -+ -+ @property -+ def exception_kwargs(self): -+ return {'source': self.source, 'lineno': self.lineno, -+ 'pos': self.pos, 'filename': self.filename} -+ -+ def get_children(self): -+ return [] -+ -+ def accept_visitor(self, visitor): -+ def traverse(node): -+ for n in node.get_children(): -+ n.accept_visitor(visitor) -+ -+ method = getattr(visitor, "visit" + self.__class__.__name__, traverse) -+ method(self) -+ -+class TemplateNode(Node): -+ """a 'container' node that stores the overall collection of nodes.""" -+ -+ def __init__(self, filename): -+ super(TemplateNode, self).__init__('', 0, 0, filename) -+ self.nodes = [] -+ self.page_attributes = {} -+ -+ def get_children(self): -+ return self.nodes -+ -+ def __repr__(self): -+ return "TemplateNode(%s, %r)" % ( -+ util.sorted_dict_repr(self.page_attributes), -+ self.nodes) -+ -+class ControlLine(Node): -+ """defines a control line, a line-oriented python line or end tag. -+ -+ e.g.:: -+ -+ % if foo: -+ (markup) -+ % endif -+ -+ """ -+ -+ has_loop_context = False -+ -+ def __init__(self, keyword, isend, text, **kwargs): -+ super(ControlLine, self).__init__(**kwargs) -+ self.text = text -+ self.keyword = keyword -+ self.isend = isend -+ self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with'] -+ self.nodes = [] -+ if self.isend: -+ self._declared_identifiers = [] -+ self._undeclared_identifiers = [] -+ else: -+ code = ast.PythonFragment(text, **self.exception_kwargs) -+ self._declared_identifiers = code.declared_identifiers -+ self._undeclared_identifiers = code.undeclared_identifiers -+ -+ def get_children(self): -+ return self.nodes -+ -+ def declared_identifiers(self): -+ return self._declared_identifiers -+ -+ def undeclared_identifiers(self): -+ return self._undeclared_identifiers -+ -+ def is_ternary(self, keyword): -+ """return true if the given keyword is a ternary keyword -+ for this ControlLine""" -+ -+ return keyword in { -+ 'if':set(['else', 'elif']), -+ 'try':set(['except', 'finally']), -+ 'for':set(['else']) -+ }.get(self.keyword, []) -+ -+ def __repr__(self): -+ return "ControlLine(%r, %r, %r, %r)" % ( -+ self.keyword, -+ self.text, -+ self.isend, -+ (self.lineno, self.pos) -+ ) -+ -+class Text(Node): -+ """defines plain text in the template.""" -+ -+ def __init__(self, content, **kwargs): -+ super(Text, self).__init__(**kwargs) -+ self.content = content -+ -+ def __repr__(self): -+ return "Text(%r, %r)" % (self.content, (self.lineno, self.pos)) -+ -+class Code(Node): -+ """defines a Python code block, either inline or module level. -+ -+ e.g.:: -+ -+ inline: -+ <% -+ x = 12 -+ %> -+ -+ module level: -+ <%! 
-+ import logger -+ %> -+ -+ """ -+ -+ def __init__(self, text, ismodule, **kwargs): -+ super(Code, self).__init__(**kwargs) -+ self.text = text -+ self.ismodule = ismodule -+ self.code = ast.PythonCode(text, **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.code.declared_identifiers -+ -+ def undeclared_identifiers(self): -+ return self.code.undeclared_identifiers -+ -+ def __repr__(self): -+ return "Code(%r, %r, %r)" % ( -+ self.text, -+ self.ismodule, -+ (self.lineno, self.pos) -+ ) -+ -+class Comment(Node): -+ """defines a comment line. -+ -+ # this is a comment -+ -+ """ -+ -+ def __init__(self, text, **kwargs): -+ super(Comment, self).__init__(**kwargs) -+ self.text = text -+ -+ def __repr__(self): -+ return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos)) -+ -+class Expression(Node): -+ """defines an inline expression. -+ -+ ${x+y} -+ -+ """ -+ -+ def __init__(self, text, escapes, **kwargs): -+ super(Expression, self).__init__(**kwargs) -+ self.text = text -+ self.escapes = escapes -+ self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs) -+ self.code = ast.PythonCode(text, **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return [] -+ -+ def undeclared_identifiers(self): -+ # TODO: make the "filter" shortcut list configurable at parse/gen time -+ return self.code.undeclared_identifiers.union( -+ self.escapes_code.undeclared_identifiers.difference( -+ set(filters.DEFAULT_ESCAPES.keys()) -+ ) -+ ).difference(self.code.declared_identifiers) -+ -+ def __repr__(self): -+ return "Expression(%r, %r, %r)" % ( -+ self.text, -+ self.escapes_code.args, -+ (self.lineno, self.pos) -+ ) -+ -+class _TagMeta(type): -+ """metaclass to allow Tag to produce a subclass according to -+ its keyword""" -+ -+ _classmap = {} -+ -+ def __init__(cls, clsname, bases, dict): -+ if getattr(cls, '__keyword__', None) is not None: -+ cls._classmap[cls.__keyword__] = cls -+ super(_TagMeta, cls).__init__(clsname, bases, dict) -+ -+ def __call__(cls, keyword, attributes, **kwargs): -+ if ":" in keyword: -+ ns, defname = keyword.split(':') -+ return type.__call__(CallNamespaceTag, ns, defname, -+ attributes, **kwargs) -+ -+ try: -+ cls = _TagMeta._classmap[keyword] -+ except KeyError: -+ raise exceptions.CompileException( -+ "No such tag: '%s'" % keyword, -+ source=kwargs['source'], -+ lineno=kwargs['lineno'], -+ pos=kwargs['pos'], -+ filename=kwargs['filename'] -+ ) -+ return type.__call__(cls, keyword, attributes, **kwargs) -+ -+class Tag(compat.with_metaclass(_TagMeta, Node)): -+ """abstract base class for tags. -+ -+ <%sometag/> -+ -+ <%someothertag> -+ stuff -+ -+ -+ """ -+ __keyword__ = None -+ -+ def __init__(self, keyword, attributes, expressions, -+ nonexpressions, required, **kwargs): -+ """construct a new Tag instance. -+ -+ this constructor not called directly, and is only called -+ by subclasses. 
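The node classes above are produced by the lexer; a small sketch of building a parse tree and walking it with ``accept_visitor()`` (the visitor class and its method bodies are illustrative only):

    from mako.lexer import Lexer

    tree = Lexer("hello ${name}\n% if name:\nhi\n% endif\n").parse()

    class Dump(object):
        # accept_visitor() dispatches on "visit" + the node class name
        def visitExpression(self, node):
            print("Expression:", node.text, node.undeclared_identifiers())
        def visitControlLine(self, node):
            print("ControlLine:", node.keyword, "isend=%r" % node.isend)

    tree.accept_visitor(Dump())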
-+ -+ :param keyword: the tag keyword -+ -+ :param attributes: raw dictionary of attribute key/value pairs -+ -+ :param expressions: a set of identifiers that are legal attributes, -+ which can also contain embedded expressions -+ -+ :param nonexpressions: a set of identifiers that are legal -+ attributes, which cannot contain embedded expressions -+ -+ :param \**kwargs: -+ other arguments passed to the Node superclass (lineno, pos) -+ -+ """ -+ super(Tag, self).__init__(**kwargs) -+ self.keyword = keyword -+ self.attributes = attributes -+ self._parse_attributes(expressions, nonexpressions) -+ missing = [r for r in required if r not in self.parsed_attributes] -+ if len(missing): -+ raise exceptions.CompileException( -+ "Missing attribute(s): %s" % -+ ",".join([repr(m) for m in missing]), -+ **self.exception_kwargs) -+ self.parent = None -+ self.nodes = [] -+ -+ def is_root(self): -+ return self.parent is None -+ -+ def get_children(self): -+ return self.nodes -+ -+ def _parse_attributes(self, expressions, nonexpressions): -+ undeclared_identifiers = set() -+ self.parsed_attributes = {} -+ for key in self.attributes: -+ if key in expressions: -+ expr = [] -+ for x in re.compile(r'(\${.+?})', -+ re.S).split(self.attributes[key]): -+ m = re.compile(r'^\${(.+?)}$', re.S).match(x) -+ if m: -+ code = ast.PythonCode(m.group(1).rstrip(), -+ **self.exception_kwargs) -+ # we aren't discarding "declared_identifiers" here, -+ # which we do so that list comprehension-declared -+ # variables aren't counted. As yet can't find a -+ # condition that requires it here. -+ undeclared_identifiers = \ -+ undeclared_identifiers.union( -+ code.undeclared_identifiers) -+ expr.append('(%s)' % m.group(1)) -+ else: -+ if x: -+ expr.append(repr(x)) -+ self.parsed_attributes[key] = " + ".join(expr) or repr('') -+ elif key in nonexpressions: -+ if re.search(r'\${.+?}', self.attributes[key]): -+ raise exceptions.CompileException( -+ "Attibute '%s' in tag '%s' does not allow embedded " -+ "expressions" % (key, self.keyword), -+ **self.exception_kwargs) -+ self.parsed_attributes[key] = repr(self.attributes[key]) -+ else: -+ raise exceptions.CompileException( -+ "Invalid attribute for tag '%s': '%s'" % -+ (self.keyword, key), -+ **self.exception_kwargs) -+ self.expression_undeclared_identifiers = undeclared_identifiers -+ -+ def declared_identifiers(self): -+ return [] -+ -+ def undeclared_identifiers(self): -+ return self.expression_undeclared_identifiers -+ -+ def __repr__(self): -+ return "%s(%r, %s, %r, %r)" % (self.__class__.__name__, -+ self.keyword, -+ util.sorted_dict_repr(self.attributes), -+ (self.lineno, self.pos), -+ self.nodes -+ ) -+ -+class IncludeTag(Tag): -+ __keyword__ = 'include' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(IncludeTag, self).__init__( -+ keyword, -+ attributes, -+ ('file', 'import', 'args'), -+ (), ('file',), **kwargs) -+ self.page_args = ast.PythonCode( -+ "__DUMMY(%s)" % attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return [] -+ -+ def undeclared_identifiers(self): -+ identifiers = self.page_args.undeclared_identifiers.\ -+ difference(set(["__DUMMY"])).\ -+ difference(self.page_args.declared_identifiers) -+ return identifiers.union(super(IncludeTag, self). 
-+ undeclared_identifiers()) -+ -+class NamespaceTag(Tag): -+ __keyword__ = 'namespace' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(NamespaceTag, self).__init__( -+ keyword, attributes, -+ ('file',), -+ ('name','inheritable', -+ 'import','module'), -+ (), **kwargs) -+ -+ self.name = attributes.get('name', '__anon_%s' % hex(abs(id(self)))) -+ if not 'name' in attributes and not 'import' in attributes: -+ raise exceptions.CompileException( -+ "'name' and/or 'import' attributes are required " -+ "for <%namespace>", -+ **self.exception_kwargs) -+ if 'file' in attributes and 'module' in attributes: -+ raise exceptions.CompileException( -+ "<%namespace> may only have one of 'file' or 'module'", -+ **self.exception_kwargs -+ ) -+ -+ def declared_identifiers(self): -+ return [] -+ -+class TextTag(Tag): -+ __keyword__ = 'text' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(TextTag, self).__init__( -+ keyword, -+ attributes, (), -+ ('filter'), (), **kwargs) -+ self.filter_args = ast.ArgumentList( -+ attributes.get('filter', ''), -+ **self.exception_kwargs) -+ -+ def undeclared_identifiers(self): -+ return self.filter_args.\ -+ undeclared_identifiers.\ -+ difference(filters.DEFAULT_ESCAPES.keys()).union( -+ self.expression_undeclared_identifiers -+ ) -+ -+class DefTag(Tag): -+ __keyword__ = 'def' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ expressions = ['buffered', 'cached'] + [ -+ c for c in attributes if c.startswith('cache_')] -+ -+ -+ super(DefTag, self).__init__( -+ keyword, -+ attributes, -+ expressions, -+ ('name', 'filter', 'decorator'), -+ ('name',), -+ **kwargs) -+ name = attributes['name'] -+ if re.match(r'^[\w_]+$', name): -+ raise exceptions.CompileException( -+ "Missing parenthesis in %def", -+ **self.exception_kwargs) -+ self.function_decl = ast.FunctionDecl("def " + name + ":pass", -+ **self.exception_kwargs) -+ self.name = self.function_decl.funcname -+ self.decorator = attributes.get('decorator', '') -+ self.filter_args = ast.ArgumentList( -+ attributes.get('filter', ''), -+ **self.exception_kwargs) -+ -+ is_anonymous = False -+ is_block = False -+ -+ @property -+ def funcname(self): -+ return self.function_decl.funcname -+ -+ def get_argument_expressions(self, **kw): -+ return self.function_decl.get_argument_expressions(**kw) -+ -+ def declared_identifiers(self): -+ return self.function_decl.allargnames -+ -+ def undeclared_identifiers(self): -+ res = [] -+ for c in self.function_decl.defaults: -+ res += list(ast.PythonCode(c, **self.exception_kwargs). 
-+ undeclared_identifiers) -+ return set(res).union( -+ self.filter_args.\ -+ undeclared_identifiers.\ -+ difference(filters.DEFAULT_ESCAPES.keys()) -+ ).union( -+ self.expression_undeclared_identifiers -+ ).difference( -+ self.function_decl.allargnames -+ ) -+ -+class BlockTag(Tag): -+ __keyword__ = 'block' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ expressions = ['buffered', 'cached', 'args'] + [ -+ c for c in attributes if c.startswith('cache_')] -+ -+ super(BlockTag, self).__init__( -+ keyword, -+ attributes, -+ expressions, -+ ('name','filter', 'decorator'), -+ (), -+ **kwargs) -+ name = attributes.get('name') -+ if name and not re.match(r'^[\w_]+$',name): -+ raise exceptions.CompileException( -+ "%block may not specify an argument signature", -+ **self.exception_kwargs) -+ if not name and attributes.get('args', None): -+ raise exceptions.CompileException( -+ "Only named %blocks may specify args", -+ **self.exception_kwargs -+ ) -+ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ self.name = name -+ self.decorator = attributes.get('decorator', '') -+ self.filter_args = ast.ArgumentList( -+ attributes.get('filter', ''), -+ **self.exception_kwargs) -+ -+ -+ is_block = True -+ -+ @property -+ def is_anonymous(self): -+ return self.name is None -+ -+ @property -+ def funcname(self): -+ return self.name or "__M_anon_%d" % (self.lineno, ) -+ -+ def get_argument_expressions(self, **kw): -+ return self.body_decl.get_argument_expressions(**kw) -+ -+ def declared_identifiers(self): -+ return self.body_decl.allargnames -+ -+ def undeclared_identifiers(self): -+ return (self.filter_args.\ -+ undeclared_identifiers.\ -+ difference(filters.DEFAULT_ESCAPES.keys()) -+ ).union(self.expression_undeclared_identifiers) -+ -+ -+ -+class CallTag(Tag): -+ __keyword__ = 'call' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(CallTag, self).__init__(keyword, attributes, -+ ('args'), ('expr',), ('expr',), **kwargs) -+ self.expression = attributes['expr'] -+ self.code = ast.PythonCode(self.expression, **self.exception_kwargs) -+ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.code.declared_identifiers.union(self.body_decl.allargnames) -+ -+ def undeclared_identifiers(self): -+ return self.code.undeclared_identifiers.\ -+ difference(self.code.declared_identifiers) -+ -+class CallNamespaceTag(Tag): -+ -+ def __init__(self, namespace, defname, attributes, **kwargs): -+ super(CallNamespaceTag, self).__init__( -+ namespace + ":" + defname, -+ attributes, -+ tuple(attributes.keys()) + ('args', ), -+ (), -+ (), -+ **kwargs) -+ -+ self.expression = "%s.%s(%s)" % ( -+ namespace, -+ defname, -+ ",".join(["%s=%s" % (k, v) for k, v in -+ self.parsed_attributes.items() -+ if k != 'args']) -+ ) -+ self.code = ast.PythonCode(self.expression, **self.exception_kwargs) -+ self.body_decl = ast.FunctionArgs( -+ attributes.get('args', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.code.declared_identifiers.union(self.body_decl.allargnames) -+ -+ def undeclared_identifiers(self): -+ return self.code.undeclared_identifiers.\ -+ difference(self.code.declared_identifiers) -+ -+class InheritTag(Tag): -+ __keyword__ = 'inherit' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ super(InheritTag, self).__init__( -+ keyword, attributes, -+ ('file',), (), ('file',), **kwargs) -+ -+class PageTag(Tag): -+ __keyword__ = 
'page' -+ -+ def __init__(self, keyword, attributes, **kwargs): -+ expressions = ['cached', 'args', 'expression_filter', 'enable_loop'] + [ -+ c for c in attributes if c.startswith('cache_')] -+ -+ super(PageTag, self).__init__( -+ keyword, -+ attributes, -+ expressions, -+ (), -+ (), -+ **kwargs) -+ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), -+ **self.exception_kwargs) -+ self.filter_args = ast.ArgumentList( -+ attributes.get('expression_filter', ''), -+ **self.exception_kwargs) -+ -+ def declared_identifiers(self): -+ return self.body_decl.allargnames -+ -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py -new file mode 100644 -index 0000000..5ba5125 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py -@@ -0,0 +1,299 @@ -+# mako/pygen.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""utilities for generating and formatting literal Python code.""" -+ -+import re -+from mako import exceptions -+ -+class PythonPrinter(object): -+ def __init__(self, stream): -+ # indentation counter -+ self.indent = 0 -+ -+ # a stack storing information about why we incremented -+ # the indentation counter, to help us determine if we -+ # should decrement it -+ self.indent_detail = [] -+ -+ # the string of whitespace multiplied by the indent -+ # counter to produce a line -+ self.indentstring = " " -+ -+ # the stream we are writing to -+ self.stream = stream -+ -+ # current line number -+ self.lineno = 1 -+ -+ # a list of lines that represents a buffered "block" of code, -+ # which can be later printed relative to an indent level -+ self.line_buffer = [] -+ -+ self.in_indent_lines = False -+ -+ self._reset_multi_line_flags() -+ -+ # mapping of generated python lines to template -+ # source lines -+ self.source_map = {} -+ -+ def _update_lineno(self, num): -+ self.lineno += num -+ -+ def start_source(self, lineno): -+ if self.lineno not in self.source_map: -+ self.source_map[self.lineno] = lineno -+ -+ def write_blanks(self, num): -+ self.stream.write("\n" * num) -+ self._update_lineno(num) -+ -+ def write_indented_block(self, block): -+ """print a line or lines of python which already contain indentation. -+ -+ The indentation of the total block of lines will be adjusted to that of -+ the current indent level.""" -+ self.in_indent_lines = False -+ for l in re.split(r'\r?\n', block): -+ self.line_buffer.append(l) -+ self._update_lineno(1) -+ -+ def writelines(self, *lines): -+ """print a series of lines of python.""" -+ for line in lines: -+ self.writeline(line) -+ -+ def writeline(self, line): -+ """print a line of python, indenting it according to the current -+ indent level. -+ -+ this also adjusts the indentation counter according to the -+ content of the line. 
-+ -+ """ -+ -+ if not self.in_indent_lines: -+ self._flush_adjusted_lines() -+ self.in_indent_lines = True -+ -+ if (line is None or -+ re.match(r"^\s*#",line) or -+ re.match(r"^\s*$", line) -+ ): -+ hastext = False -+ else: -+ hastext = True -+ -+ is_comment = line and len(line) and line[0] == '#' -+ -+ # see if this line should decrease the indentation level -+ if (not is_comment and -+ (not hastext or self._is_unindentor(line)) -+ ): -+ -+ if self.indent > 0: -+ self.indent -= 1 -+ # if the indent_detail stack is empty, the user -+ # probably put extra closures - the resulting -+ # module wont compile. -+ if len(self.indent_detail) == 0: -+ raise exceptions.SyntaxException( -+ "Too many whitespace closures") -+ self.indent_detail.pop() -+ -+ if line is None: -+ return -+ -+ # write the line -+ self.stream.write(self._indent_line(line) + "\n") -+ self._update_lineno(len(line.split("\n"))) -+ -+ # see if this line should increase the indentation level. -+ # note that a line can both decrase (before printing) and -+ # then increase (after printing) the indentation level. -+ -+ if re.search(r":[ \t]*(?:#.*)?$", line): -+ # increment indentation count, and also -+ # keep track of what the keyword was that indented us, -+ # if it is a python compound statement keyword -+ # where we might have to look for an "unindent" keyword -+ match = re.match(r"^\s*(if|try|elif|while|for|with)", line) -+ if match: -+ # its a "compound" keyword, so we will check for "unindentors" -+ indentor = match.group(1) -+ self.indent += 1 -+ self.indent_detail.append(indentor) -+ else: -+ indentor = None -+ # its not a "compound" keyword. but lets also -+ # test for valid Python keywords that might be indenting us, -+ # else assume its a non-indenting line -+ m2 = re.match(r"^\s*(def|class|else|elif|except|finally)", -+ line) -+ if m2: -+ self.indent += 1 -+ self.indent_detail.append(indentor) -+ -+ def close(self): -+ """close this printer, flushing any remaining lines.""" -+ self._flush_adjusted_lines() -+ -+ def _is_unindentor(self, line): -+ """return true if the given line is an 'unindentor', -+ relative to the last 'indent' event received. -+ -+ """ -+ -+ # no indentation detail has been pushed on; return False -+ if len(self.indent_detail) == 0: -+ return False -+ -+ indentor = self.indent_detail[-1] -+ -+ # the last indent keyword we grabbed is not a -+ # compound statement keyword; return False -+ if indentor is None: -+ return False -+ -+ # if the current line doesnt have one of the "unindentor" keywords, -+ # return False -+ match = re.match(r"^\s*(else|elif|except|finally).*\:", line) -+ if not match: -+ return False -+ -+ # whitespace matches up, we have a compound indentor, -+ # and this line has an unindentor, this -+ # is probably good enough -+ return True -+ -+ # should we decide that its not good enough, heres -+ # more stuff to check. -+ #keyword = match.group(1) -+ -+ # match the original indent keyword -+ #for crit in [ -+ # (r'if|elif', r'else|elif'), -+ # (r'try', r'except|finally|else'), -+ # (r'while|for', r'else'), -+ #]: -+ # if re.match(crit[0], indentor) and re.match(crit[1], keyword): -+ # return True -+ -+ #return False -+ -+ def _indent_line(self, line, stripspace=''): -+ """indent the given line according to the current indent level. 
-+ -+ stripspace is a string of space that will be truncated from the -+ start of the line before indenting.""" -+ -+ return re.sub(r"^%s" % stripspace, self.indentstring -+ * self.indent, line) -+ -+ def _reset_multi_line_flags(self): -+ """reset the flags which would indicate we are in a backslashed -+ or triple-quoted section.""" -+ -+ self.backslashed, self.triplequoted = False, False -+ -+ def _in_multi_line(self, line): -+ """return true if the given line is part of a multi-line block, -+ via backslash or triple-quote.""" -+ -+ # we are only looking for explicitly joined lines here, not -+ # implicit ones (i.e. brackets, braces etc.). this is just to -+ # guard against the possibility of modifying the space inside of -+ # a literal multiline string with unfortunately placed -+ # whitespace -+ -+ current_state = (self.backslashed or self.triplequoted) -+ -+ if re.search(r"\\$", line): -+ self.backslashed = True -+ else: -+ self.backslashed = False -+ -+ triples = len(re.findall(r"\"\"\"|\'\'\'", line)) -+ if triples == 1 or triples % 2 != 0: -+ self.triplequoted = not self.triplequoted -+ -+ return current_state -+ -+ def _flush_adjusted_lines(self): -+ stripspace = None -+ self._reset_multi_line_flags() -+ -+ for entry in self.line_buffer: -+ if self._in_multi_line(entry): -+ self.stream.write(entry + "\n") -+ else: -+ entry = entry.expandtabs() -+ if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry): -+ stripspace = re.match(r"^([ \t]*)", entry).group(1) -+ self.stream.write(self._indent_line(entry, stripspace) + "\n") -+ -+ self.line_buffer = [] -+ self._reset_multi_line_flags() -+ -+ -+def adjust_whitespace(text): -+ """remove the left-whitespace margin of a block of Python code.""" -+ -+ state = [False, False] -+ (backslashed, triplequoted) = (0, 1) -+ -+ def in_multi_line(line): -+ start_state = (state[backslashed] or state[triplequoted]) -+ -+ if re.search(r"\\$", line): -+ state[backslashed] = True -+ else: -+ state[backslashed] = False -+ -+ def match(reg, t): -+ m = re.match(reg, t) -+ if m: -+ return m, t[len(m.group(0)):] -+ else: -+ return None, t -+ -+ while line: -+ if state[triplequoted]: -+ m, line = match(r"%s" % state[triplequoted], line) -+ if m: -+ state[triplequoted] = False -+ else: -+ m, line = match(r".*?(?=%s|$)" % state[triplequoted], line) -+ else: -+ m, line = match(r'#', line) -+ if m: -+ return start_state -+ -+ m, line = match(r"\"\"\"|\'\'\'", line) -+ if m: -+ state[triplequoted] = m.group(0) -+ continue -+ -+ m, line = match(r".*?(?=\"\"\"|\'\'\'|#|$)", line) -+ -+ return start_state -+ -+ def _indent_line(line, stripspace=''): -+ return re.sub(r"^%s" % stripspace, '', line) -+ -+ lines = [] -+ stripspace = None -+ -+ for line in re.split(r'\r?\n', text): -+ if in_multi_line(line): -+ lines.append(line) -+ else: -+ line = line.expandtabs() -+ if stripspace is None and re.search(r"^[ \t]*[^# \t]", line): -+ stripspace = re.match(r"^([ \t]*)", line).group(1) -+ lines.append(_indent_line(line, stripspace)) -+ return "\n".join(lines) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py -new file mode 100644 -index 0000000..bfa46a9 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py -@@ -0,0 +1,232 @@ -+# mako/pyparser.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""Handles parsing 
of Python code. -+ -+Parsing to AST is done via _ast on Python > 2.5, otherwise the compiler -+module is used. -+""" -+ -+from mako import exceptions, util, compat -+from mako.compat import arg_stringname -+import operator -+ -+if compat.py3k: -+ # words that cannot be assigned to (notably -+ # smaller than the total keys in __builtins__) -+ reserved = set(['True', 'False', 'None', 'print']) -+ -+ # the "id" attribute on a function node -+ arg_id = operator.attrgetter('arg') -+else: -+ # words that cannot be assigned to (notably -+ # smaller than the total keys in __builtins__) -+ reserved = set(['True', 'False', 'None']) -+ -+ # the "id" attribute on a function node -+ arg_id = operator.attrgetter('id') -+ -+import _ast -+util.restore__ast(_ast) -+from mako import _ast_util -+ -+ -+def parse(code, mode='exec', **exception_kwargs): -+ """Parse an expression into AST""" -+ -+ try: -+ return _ast_util.parse(code, '', mode) -+ except Exception: -+ raise exceptions.SyntaxException( -+ "(%s) %s (%r)" % ( -+ compat.exception_as().__class__.__name__, -+ compat.exception_as(), -+ code[0:50] -+ ), **exception_kwargs) -+ -+ -+class FindIdentifiers(_ast_util.NodeVisitor): -+ -+ def __init__(self, listener, **exception_kwargs): -+ self.in_function = False -+ self.in_assign_targets = False -+ self.local_ident_stack = set() -+ self.listener = listener -+ self.exception_kwargs = exception_kwargs -+ -+ def _add_declared(self, name): -+ if not self.in_function: -+ self.listener.declared_identifiers.add(name) -+ else: -+ self.local_ident_stack.add(name) -+ -+ def visit_ClassDef(self, node): -+ self._add_declared(node.name) -+ -+ def visit_Assign(self, node): -+ -+ # flip around the visiting of Assign so the expression gets -+ # evaluated first, in the case of a clause like "x=x+5" (x -+ # is undeclared) -+ -+ self.visit(node.value) -+ in_a = self.in_assign_targets -+ self.in_assign_targets = True -+ for n in node.targets: -+ self.visit(n) -+ self.in_assign_targets = in_a -+ -+ if compat.py3k: -+ -+ # ExceptHandler is in Python 2, but this block only works in -+ # Python 3 (and is required there) -+ -+ def visit_ExceptHandler(self, node): -+ if node.name is not None: -+ self._add_declared(node.name) -+ if node.type is not None: -+ self.visit(node.type) -+ for statement in node.body: -+ self.visit(statement) -+ -+ def visit_Lambda(self, node, *args): -+ self._visit_function(node, True) -+ -+ def visit_FunctionDef(self, node): -+ self._add_declared(node.name) -+ self._visit_function(node, False) -+ -+ def _expand_tuples(self, args): -+ for arg in args: -+ if isinstance(arg, _ast.Tuple): -+ for n in arg.elts: -+ yield n -+ else: -+ yield arg -+ -+ def _visit_function(self, node, islambda): -+ -+ # push function state onto stack. dont log any more -+ # identifiers as "declared" until outside of the function, -+ # but keep logging identifiers as "undeclared". 
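The identifier tracking implemented by ``FindIdentifiers`` is normally driven through ``mako.ast.PythonCode``, which the parse tree nodes above rely on; a minimal sketch (the keyword arguments simply mirror ``Node.exception_kwargs``):

    from mako import ast

    kw = dict(source='', lineno=1, pos=1, filename=None)
    code = ast.PythonCode("y = x + offset", **kw)
    print(sorted(code.declared_identifiers))      # ['y']
    print(sorted(code.undeclared_identifiers))    # ['offset', 'x']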
track -+ # argument names in each function header so they arent -+ # counted as "undeclared" -+ -+ inf = self.in_function -+ self.in_function = True -+ -+ local_ident_stack = self.local_ident_stack -+ self.local_ident_stack = local_ident_stack.union([ -+ arg_id(arg) for arg in self._expand_tuples(node.args.args) -+ ]) -+ if islambda: -+ self.visit(node.body) -+ else: -+ for n in node.body: -+ self.visit(n) -+ self.in_function = inf -+ self.local_ident_stack = local_ident_stack -+ -+ def visit_For(self, node): -+ -+ # flip around visit -+ -+ self.visit(node.iter) -+ self.visit(node.target) -+ for statement in node.body: -+ self.visit(statement) -+ for statement in node.orelse: -+ self.visit(statement) -+ -+ def visit_Name(self, node): -+ if isinstance(node.ctx, _ast.Store): -+ # this is eqiuvalent to visit_AssName in -+ # compiler -+ self._add_declared(node.id) -+ elif node.id not in reserved and node.id \ -+ not in self.listener.declared_identifiers and node.id \ -+ not in self.local_ident_stack: -+ self.listener.undeclared_identifiers.add(node.id) -+ -+ def visit_Import(self, node): -+ for name in node.names: -+ if name.asname is not None: -+ self._add_declared(name.asname) -+ else: -+ self._add_declared(name.name.split('.')[0]) -+ -+ def visit_ImportFrom(self, node): -+ for name in node.names: -+ if name.asname is not None: -+ self._add_declared(name.asname) -+ else: -+ if name.name == '*': -+ raise exceptions.CompileException( -+ "'import *' is not supported, since all identifier " -+ "names must be explicitly declared. Please use the " -+ "form 'from import , , " -+ "...' instead.", **self.exception_kwargs) -+ self._add_declared(name.name) -+ -+ -+class FindTuple(_ast_util.NodeVisitor): -+ -+ def __init__(self, listener, code_factory, **exception_kwargs): -+ self.listener = listener -+ self.exception_kwargs = exception_kwargs -+ self.code_factory = code_factory -+ -+ def visit_Tuple(self, node): -+ for n in node.elts: -+ p = self.code_factory(n, **self.exception_kwargs) -+ self.listener.codeargs.append(p) -+ self.listener.args.append(ExpressionGenerator(n).value()) -+ self.listener.declared_identifiers = \ -+ self.listener.declared_identifiers.union( -+ p.declared_identifiers) -+ self.listener.undeclared_identifiers = \ -+ self.listener.undeclared_identifiers.union( -+ p.undeclared_identifiers) -+ -+ -+class ParseFunc(_ast_util.NodeVisitor): -+ -+ def __init__(self, listener, **exception_kwargs): -+ self.listener = listener -+ self.exception_kwargs = exception_kwargs -+ -+ def visit_FunctionDef(self, node): -+ self.listener.funcname = node.name -+ -+ argnames = [arg_id(arg) for arg in node.args.args] -+ if node.args.vararg: -+ argnames.append(arg_stringname(node.args.vararg)) -+ -+ if compat.py2k: -+ # kw-only args don't exist in Python 2 -+ kwargnames = [] -+ else: -+ kwargnames = [arg_id(arg) for arg in node.args.kwonlyargs] -+ if node.args.kwarg: -+ kwargnames.append(arg_stringname(node.args.kwarg)) -+ self.listener.argnames = argnames -+ self.listener.defaults = node.args.defaults # ast -+ self.listener.kwargnames = kwargnames -+ if compat.py2k: -+ self.listener.kwdefaults = [] -+ else: -+ self.listener.kwdefaults = node.args.kw_defaults -+ self.listener.varargs = node.args.vararg -+ self.listener.kwargs = node.args.kwarg -+ -+class ExpressionGenerator(object): -+ -+ def __init__(self, astnode): -+ self.generator = _ast_util.SourceGenerator(' ' * 4) -+ self.generator.visit(astnode) -+ -+ def value(self): -+ return ''.join(self.generator.result) -diff --git 
a/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py -new file mode 100644 -index 0000000..6b6a35a ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py -@@ -0,0 +1,878 @@ -+# mako/runtime.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""provides runtime services for templates, including Context, -+Namespace, and various helper functions.""" -+ -+from mako import exceptions, util, compat -+from mako.compat import compat_builtins -+import sys -+ -+ -+class Context(object): -+ """Provides runtime namespace, output buffer, and various -+ callstacks for templates. -+ -+ See :ref:`runtime_toplevel` for detail on the usage of -+ :class:`.Context`. -+ -+ """ -+ -+ def __init__(self, buffer, **data): -+ self._buffer_stack = [buffer] -+ -+ self._data = data -+ -+ self._kwargs = data.copy() -+ self._with_template = None -+ self._outputting_as_unicode = None -+ self.namespaces = {} -+ -+ # "capture" function which proxies to the -+ # generic "capture" function -+ self._data['capture'] = compat.partial(capture, self) -+ -+ # "caller" stack used by def calls with content -+ self.caller_stack = self._data['caller'] = CallerStack() -+ -+ def _set_with_template(self, t): -+ self._with_template = t -+ illegal_names = t.reserved_names.intersection(self._data) -+ if illegal_names: -+ raise exceptions.NameConflictError( -+ "Reserved words passed to render(): %s" % -+ ", ".join(illegal_names)) -+ -+ @property -+ def lookup(self): -+ """Return the :class:`.TemplateLookup` associated -+ with this :class:`.Context`. -+ -+ """ -+ return self._with_template.lookup -+ -+ @property -+ def kwargs(self): -+ """Return the dictionary of top level keyword arguments associated -+ with this :class:`.Context`. -+ -+ This dictionary only includes the top-level arguments passed to -+ :meth:`.Template.render`. It does not include names produced within -+ the template execution such as local variable names or special names -+ such as ``self``, ``next``, etc. -+ -+ The purpose of this dictionary is primarily for the case that -+ a :class:`.Template` accepts arguments via its ``<%page>`` tag, -+ which are normally expected to be passed via :meth:`.Template.render`, -+ except the template is being called in an inheritance context, -+ using the ``body()`` method. 
:attr:`.Context.kwargs` can then be -+ used to propagate these arguments to the inheriting template:: -+ -+ ${next.body(**context.kwargs)} -+ -+ """ -+ return self._kwargs.copy() -+ -+ def push_caller(self, caller): -+ """Push a ``caller`` callable onto the callstack for -+ this :class:`.Context`.""" -+ -+ -+ self.caller_stack.append(caller) -+ -+ def pop_caller(self): -+ """Pop a ``caller`` callable onto the callstack for this -+ :class:`.Context`.""" -+ -+ del self.caller_stack[-1] -+ -+ def keys(self): -+ """Return a list of all names established in this :class:`.Context`.""" -+ -+ return list(self._data.keys()) -+ -+ def __getitem__(self, key): -+ if key in self._data: -+ return self._data[key] -+ else: -+ return compat_builtins.__dict__[key] -+ -+ def _push_writer(self): -+ """push a capturing buffer onto this Context and return -+ the new writer function.""" -+ -+ buf = util.FastEncodingBuffer() -+ self._buffer_stack.append(buf) -+ return buf.write -+ -+ def _pop_buffer_and_writer(self): -+ """pop the most recent capturing buffer from this Context -+ and return the current writer after the pop. -+ -+ """ -+ -+ buf = self._buffer_stack.pop() -+ return buf, self._buffer_stack[-1].write -+ -+ def _push_buffer(self): -+ """push a capturing buffer onto this Context.""" -+ -+ self._push_writer() -+ -+ def _pop_buffer(self): -+ """pop the most recent capturing buffer from this Context.""" -+ -+ return self._buffer_stack.pop() -+ -+ def get(self, key, default=None): -+ """Return a value from this :class:`.Context`.""" -+ -+ return self._data.get(key, compat_builtins.__dict__.get(key, default)) -+ -+ def write(self, string): -+ """Write a string to this :class:`.Context` object's -+ underlying output buffer.""" -+ -+ self._buffer_stack[-1].write(string) -+ -+ def writer(self): -+ """Return the current writer function.""" -+ -+ return self._buffer_stack[-1].write -+ -+ def _copy(self): -+ c = Context.__new__(Context) -+ c._buffer_stack = self._buffer_stack -+ c._data = self._data.copy() -+ c._kwargs = self._kwargs -+ c._with_template = self._with_template -+ c._outputting_as_unicode = self._outputting_as_unicode -+ c.namespaces = self.namespaces -+ c.caller_stack = self.caller_stack -+ return c -+ -+ def _locals(self, d): -+ """Create a new :class:`.Context` with a copy of this -+ :class:`.Context`'s current state, -+ updated with the given dictionary. -+ -+ The :attr:`.Context.kwargs` collection remains -+ unaffected. -+ -+ -+ """ -+ -+ if not d: -+ return self -+ c = self._copy() -+ c._data.update(d) -+ return c -+ -+ def _clean_inheritance_tokens(self): -+ """create a new copy of this :class:`.Context`. with -+ tokens related to inheritance state removed.""" -+ -+ c = self._copy() -+ x = c._data -+ x.pop('self', None) -+ x.pop('parent', None) -+ x.pop('next', None) -+ return c -+ -+class CallerStack(list): -+ def __init__(self): -+ self.nextcaller = None -+ -+ def __nonzero__(self): -+ return self.__bool__() -+ -+ def __bool__(self): -+ return len(self) and self._get_caller() and True or False -+ -+ def _get_caller(self): -+ # this method can be removed once -+ # codegen MAGIC_NUMBER moves past 7 -+ return self[-1] -+ -+ def __getattr__(self, key): -+ return getattr(self._get_caller(), key) -+ -+ def _push_frame(self): -+ frame = self.nextcaller or None -+ self.append(frame) -+ self.nextcaller = None -+ return frame -+ -+ def _pop_frame(self): -+ self.nextcaller = self.pop() -+ -+ -+class Undefined(object): -+ """Represents an undefined value in a template. 
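The ``get()``/``__getitem__`` fallback above means template expressions see Python builtins automatically, while truly missing names fall back to ``UNDEFINED``; a small sketch:

    from mako.template import Template

    # builtins are resolved through the Context's fallback to __builtins__
    print(Template("${len('OpenSWR')}").render())     # 7

    # a name that is neither supplied nor a builtin becomes UNDEFINED,
    # and rendering it raises NameError("Undefined")
    try:
        Template("${missing}").render()
    except NameError as err:
        print(err)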
-+ -+ All template modules have a constant value -+ ``UNDEFINED`` present which is an instance of this -+ object. -+ -+ """ -+ def __str__(self): -+ raise NameError("Undefined") -+ -+ def __nonzero__(self): -+ return self.__bool__() -+ -+ def __bool__(self): -+ return False -+ -+UNDEFINED = Undefined() -+ -+class LoopStack(object): -+ """a stack for LoopContexts that implements the context manager protocol -+ to automatically pop off the top of the stack on context exit -+ """ -+ -+ def __init__(self): -+ self.stack = [] -+ -+ def _enter(self, iterable): -+ self._push(iterable) -+ return self._top -+ -+ def _exit(self): -+ self._pop() -+ return self._top -+ -+ @property -+ def _top(self): -+ if self.stack: -+ return self.stack[-1] -+ else: -+ return self -+ -+ def _pop(self): -+ return self.stack.pop() -+ -+ def _push(self, iterable): -+ new = LoopContext(iterable) -+ if self.stack: -+ new.parent = self.stack[-1] -+ return self.stack.append(new) -+ -+ def __getattr__(self, key): -+ raise exceptions.RuntimeException("No loop context is established") -+ -+ def __iter__(self): -+ return iter(self._top) -+ -+ -+class LoopContext(object): -+ """A magic loop variable. -+ Automatically accessible in any ``% for`` block. -+ -+ See the section :ref:`loop_context` for usage -+ notes. -+ -+ :attr:`parent` -> :class:`.LoopContext` or ``None`` -+ The parent loop, if one exists. -+ :attr:`index` -> `int` -+ The 0-based iteration count. -+ :attr:`reverse_index` -> `int` -+ The number of iterations remaining. -+ :attr:`first` -> `bool` -+ ``True`` on the first iteration, ``False`` otherwise. -+ :attr:`last` -> `bool` -+ ``True`` on the last iteration, ``False`` otherwise. -+ :attr:`even` -> `bool` -+ ``True`` when ``index`` is even. -+ :attr:`odd` -> `bool` -+ ``True`` when ``index`` is odd. -+ """ -+ -+ def __init__(self, iterable): -+ self._iterable = iterable -+ self.index = 0 -+ self.parent = None -+ -+ def __iter__(self): -+ for i in self._iterable: -+ yield i -+ self.index += 1 -+ -+ @util.memoized_instancemethod -+ def __len__(self): -+ return len(self._iterable) -+ -+ @property -+ def reverse_index(self): -+ return len(self) - self.index - 1 -+ -+ @property -+ def first(self): -+ return self.index == 0 -+ -+ @property -+ def last(self): -+ return self.index == len(self) - 1 -+ -+ @property -+ def even(self): -+ return not self.odd -+ -+ @property -+ def odd(self): -+ return bool(self.index % 2) -+ -+ def cycle(self, *values): -+ """Cycle through values as the loop progresses. -+ """ -+ if not values: -+ raise ValueError("You must provide values to cycle through") -+ return values[self.index % len(values)] -+ -+ -+class _NSAttr(object): -+ def __init__(self, parent): -+ self.__parent = parent -+ def __getattr__(self, key): -+ ns = self.__parent -+ while ns: -+ if hasattr(ns.module, key): -+ return getattr(ns.module, key) -+ else: -+ ns = ns.inherits -+ raise AttributeError(key) -+ -+class Namespace(object): -+ """Provides access to collections of rendering methods, which -+ can be local, from other templates, or from imported modules. -+ -+ To access a particular rendering method referenced by a -+ :class:`.Namespace`, use plain attribute access: -+ -+ .. sourcecode:: mako -+ -+ ${some_namespace.foo(x, y, z)} -+ -+ :class:`.Namespace` also contains several built-in attributes -+ described here. 
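The ``loop`` variable exposed by :class:`.LoopContext` is available in any ``% for`` block when ``enable_loop`` is on (the default); for example:

    from mako.template import Template

    t = Template(
        "% for item in ('a', 'b', 'c'):\n"
        "${loop.index}/${loop.reverse_index} ${item}"
        "${' <- last' if loop.last else ''}\n"
        "% endfor\n")
    print(t.render())
    # 0/2 a
    # 1/1 b
    # 2/0 c <- last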
-+ -+ """ -+ -+ def __init__(self, name, context, -+ callables=None, inherits=None, -+ populate_self=True, calling_uri=None): -+ self.name = name -+ self.context = context -+ self.inherits = inherits -+ if callables is not None: -+ self.callables = dict([(c.__name__, c) for c in callables]) -+ -+ callables = () -+ -+ module = None -+ """The Python module referenced by this :class:`.Namespace`. -+ -+ If the namespace references a :class:`.Template`, then -+ this module is the equivalent of ``template.module``, -+ i.e. the generated module for the template. -+ -+ """ -+ -+ template = None -+ """The :class:`.Template` object referenced by this -+ :class:`.Namespace`, if any. -+ -+ """ -+ -+ context = None -+ """The :class:`.Context` object for this :class:`.Namespace`. -+ -+ Namespaces are often created with copies of contexts that -+ contain slightly different data, particularly in inheritance -+ scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one -+ can traverse an entire chain of templates that inherit from -+ one-another. -+ -+ """ -+ -+ filename = None -+ """The path of the filesystem file used for this -+ :class:`.Namespace`'s module or template. -+ -+ If this is a pure module-based -+ :class:`.Namespace`, this evaluates to ``module.__file__``. If a -+ template-based namespace, it evaluates to the original -+ template file location. -+ -+ """ -+ -+ uri = None -+ """The URI for this :class:`.Namespace`'s template. -+ -+ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. -+ -+ This is the equivalent of :attr:`.Template.uri`. -+ -+ """ -+ -+ _templateuri = None -+ -+ @util.memoized_property -+ def attr(self): -+ """Access module level attributes by name. -+ -+ This accessor allows templates to supply "scalar" -+ attributes which are particularly handy in inheritance -+ relationships. -+ -+ .. seealso:: -+ -+ :ref:`inheritance_attr` -+ -+ :ref:`namespace_attr_for_includes` -+ -+ """ -+ return _NSAttr(self) -+ -+ def get_namespace(self, uri): -+ """Return a :class:`.Namespace` corresponding to the given ``uri``. -+ -+ If the given ``uri`` is a relative URI (i.e. it does not -+ contain a leading slash ``/``), the ``uri`` is adjusted to -+ be relative to the ``uri`` of the namespace itself. This -+ method is therefore mostly useful off of the built-in -+ ``local`` namespace, described in :ref:`namespace_local`. -+ -+ In -+ most cases, a template wouldn't need this function, and -+ should instead use the ``<%namespace>`` tag to load -+ namespaces. However, since all ``<%namespace>`` tags are -+ evaluated before the body of a template ever runs, -+ this method can be used to locate namespaces using -+ expressions that were generated within the body code of -+ the template, or to conditionally use a particular -+ namespace. -+ -+ """ -+ key = (self, uri) -+ if key in self.context.namespaces: -+ return self.context.namespaces[key] -+ else: -+ ns = TemplateNamespace(uri, self.context._copy(), -+ templateuri=uri, -+ calling_uri=self._templateuri) -+ self.context.namespaces[key] = ns -+ return ns -+ -+ def get_template(self, uri): -+ """Return a :class:`.Template` from the given ``uri``. -+ -+ The ``uri`` resolution is relative to the ``uri`` of this -+ :class:`.Namespace` object's :class:`.Template`. -+ -+ """ -+ return _lookup_template(self.context, uri, self._templateuri) -+ -+ def get_cached(self, key, **kwargs): -+ """Return a value from the :class:`.Cache` referenced by this -+ :class:`.Namespace` object's :class:`.Template`. 
-+ -+ The advantage to this method versus direct access to the -+ :class:`.Cache` is that the configuration parameters -+ declared in ``<%page>`` take effect here, thereby calling -+ up the same configured backend as that configured -+ by ``<%page>``. -+ -+ """ -+ -+ return self.cache.get(key, **kwargs) -+ -+ @property -+ def cache(self): -+ """Return the :class:`.Cache` object referenced -+ by this :class:`.Namespace` object's -+ :class:`.Template`. -+ -+ """ -+ return self.template.cache -+ -+ def include_file(self, uri, **kwargs): -+ """Include a file at the given ``uri``.""" -+ -+ _include_file(self.context, uri, self._templateuri, **kwargs) -+ -+ def _populate(self, d, l): -+ for ident in l: -+ if ident == '*': -+ for (k, v) in self._get_star(): -+ d[k] = v -+ else: -+ d[ident] = getattr(self, ident) -+ -+ def _get_star(self): -+ if self.callables: -+ for key in self.callables: -+ yield (key, self.callables[key]) -+ -+ def __getattr__(self, key): -+ if key in self.callables: -+ val = self.callables[key] -+ elif self.inherits: -+ val = getattr(self.inherits, key) -+ else: -+ raise AttributeError( -+ "Namespace '%s' has no member '%s'" % -+ (self.name, key)) -+ setattr(self, key, val) -+ return val -+ -+class TemplateNamespace(Namespace): -+ """A :class:`.Namespace` specific to a :class:`.Template` instance.""" -+ -+ def __init__(self, name, context, template=None, templateuri=None, -+ callables=None, inherits=None, -+ populate_self=True, calling_uri=None): -+ self.name = name -+ self.context = context -+ self.inherits = inherits -+ if callables is not None: -+ self.callables = dict([(c.__name__, c) for c in callables]) -+ -+ if templateuri is not None: -+ self.template = _lookup_template(context, templateuri, -+ calling_uri) -+ self._templateuri = self.template.module._template_uri -+ elif template is not None: -+ self.template = template -+ self._templateuri = template.module._template_uri -+ else: -+ raise TypeError("'template' argument is required.") -+ -+ if populate_self: -+ lclcallable, lclcontext = \ -+ _populate_self_namespace(context, self.template, -+ self_ns=self) -+ -+ @property -+ def module(self): -+ """The Python module referenced by this :class:`.Namespace`. -+ -+ If the namespace references a :class:`.Template`, then -+ this module is the equivalent of ``template.module``, -+ i.e. the generated module for the template. -+ -+ """ -+ return self.template.module -+ -+ @property -+ def filename(self): -+ """The path of the filesystem file used for this -+ :class:`.Namespace`'s module or template. -+ """ -+ return self.template.filename -+ -+ @property -+ def uri(self): -+ """The URI for this :class:`.Namespace`'s template. -+ -+ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. -+ -+ This is the equivalent of :attr:`.Template.uri`. 
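Template inheritance builds on :class:`.TemplateNamespace` as well; a minimal sketch in which a page template wraps itself in a base template via ``<%inherit>``:

    from mako.lookup import TemplateLookup

    lookup = TemplateLookup(directories=['.'])
    lookup.put_string('base.html', 'header\n${self.body()}\nfooter')
    lookup.put_string('page.html',
        '<%inherit file="base.html"/>page content')
    print(lookup.get_template('page.html').render())
    # header
    # page content
    # footer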
-+ -+ """ -+ return self.template.uri -+ -+ def _get_star(self): -+ if self.callables: -+ for key in self.callables: -+ yield (key, self.callables[key]) -+ def get(key): -+ callable_ = self.template._get_def_callable(key) -+ return compat.partial(callable_, self.context) -+ for k in self.template.module._exports: -+ yield (k, get(k)) -+ -+ def __getattr__(self, key): -+ if key in self.callables: -+ val = self.callables[key] -+ elif self.template.has_def(key): -+ callable_ = self.template._get_def_callable(key) -+ val = compat.partial(callable_, self.context) -+ elif self.inherits: -+ val = getattr(self.inherits, key) -+ -+ else: -+ raise AttributeError( -+ "Namespace '%s' has no member '%s'" % -+ (self.name, key)) -+ setattr(self, key, val) -+ return val -+ -+class ModuleNamespace(Namespace): -+ """A :class:`.Namespace` specific to a Python module instance.""" -+ -+ def __init__(self, name, context, module, -+ callables=None, inherits=None, -+ populate_self=True, calling_uri=None): -+ self.name = name -+ self.context = context -+ self.inherits = inherits -+ if callables is not None: -+ self.callables = dict([(c.__name__, c) for c in callables]) -+ -+ mod = __import__(module) -+ for token in module.split('.')[1:]: -+ mod = getattr(mod, token) -+ self.module = mod -+ -+ @property -+ def filename(self): -+ """The path of the filesystem file used for this -+ :class:`.Namespace`'s module or template. -+ """ -+ return self.module.__file__ -+ -+ def _get_star(self): -+ if self.callables: -+ for key in self.callables: -+ yield (key, self.callables[key]) -+ for key in dir(self.module): -+ if key[0] != '_': -+ callable_ = getattr(self.module, key) -+ if compat.callable(callable_): -+ yield key, compat.partial(callable_, self.context) -+ -+ -+ def __getattr__(self, key): -+ if key in self.callables: -+ val = self.callables[key] -+ elif hasattr(self.module, key): -+ callable_ = getattr(self.module, key) -+ val = compat.partial(callable_, self.context) -+ elif self.inherits: -+ val = getattr(self.inherits, key) -+ else: -+ raise AttributeError( -+ "Namespace '%s' has no member '%s'" % -+ (self.name, key)) -+ setattr(self, key, val) -+ return val -+ -+def supports_caller(func): -+ """Apply a caller_stack compatibility decorator to a plain -+ Python function. -+ -+ See the example in :ref:`namespaces_python_modules`. -+ -+ """ -+ -+ def wrap_stackframe(context, *args, **kwargs): -+ context.caller_stack._push_frame() -+ try: -+ return func(context, *args, **kwargs) -+ finally: -+ context.caller_stack._pop_frame() -+ return wrap_stackframe -+ -+def capture(context, callable_, *args, **kwargs): -+ """Execute the given template def, capturing the output into -+ a buffer. -+ -+ See the example in :ref:`namespaces_python_modules`. -+ -+ """ -+ -+ if not compat.callable(callable_): -+ raise exceptions.RuntimeException( -+ "capture() function expects a callable as " -+ "its argument (i.e. 
capture(func, *args, **kwargs))" -+ ) -+ context._push_buffer() -+ try: -+ callable_(*args, **kwargs) -+ finally: -+ buf = context._pop_buffer() -+ return buf.getvalue() -+ -+def _decorate_toplevel(fn): -+ def decorate_render(render_fn): -+ def go(context, *args, **kw): -+ def y(*args, **kw): -+ return render_fn(context, *args, **kw) -+ try: -+ y.__name__ = render_fn.__name__[7:] -+ except TypeError: -+ # < Python 2.4 -+ pass -+ return fn(y)(context, *args, **kw) -+ return go -+ return decorate_render -+ -+def _decorate_inline(context, fn): -+ def decorate_render(render_fn): -+ dec = fn(render_fn) -+ def go(*args, **kw): -+ return dec(context, *args, **kw) -+ return go -+ return decorate_render -+ -+def _include_file(context, uri, calling_uri, **kwargs): -+ """locate the template from the given uri and include it in -+ the current output.""" -+ -+ template = _lookup_template(context, uri, calling_uri) -+ (callable_, ctx) = _populate_self_namespace( -+ context._clean_inheritance_tokens(), -+ template) -+ callable_(ctx, **_kwargs_for_include(callable_, context._data, **kwargs)) -+ -+def _inherit_from(context, uri, calling_uri): -+ """called by the _inherit method in template modules to set -+ up the inheritance chain at the start of a template's -+ execution.""" -+ -+ if uri is None: -+ return None -+ template = _lookup_template(context, uri, calling_uri) -+ self_ns = context['self'] -+ ih = self_ns -+ while ih.inherits is not None: -+ ih = ih.inherits -+ lclcontext = context._locals({'next': ih}) -+ ih.inherits = TemplateNamespace("self:%s" % template.uri, -+ lclcontext, -+ template=template, -+ populate_self=False) -+ context._data['parent'] = lclcontext._data['local'] = ih.inherits -+ callable_ = getattr(template.module, '_mako_inherit', None) -+ if callable_ is not None: -+ ret = callable_(template, lclcontext) -+ if ret: -+ return ret -+ -+ gen_ns = getattr(template.module, '_mako_generate_namespaces', None) -+ if gen_ns is not None: -+ gen_ns(context) -+ return (template.callable_, lclcontext) -+ -+def _lookup_template(context, uri, relativeto): -+ lookup = context._with_template.lookup -+ if lookup is None: -+ raise exceptions.TemplateLookupException( -+ "Template '%s' has no TemplateLookup associated" % -+ context._with_template.uri) -+ uri = lookup.adjust_uri(uri, relativeto) -+ try: -+ return lookup.get_template(uri) -+ except exceptions.TopLevelLookupException: -+ raise exceptions.TemplateLookupException(str(compat.exception_as())) -+ -+def _populate_self_namespace(context, template, self_ns=None): -+ if self_ns is None: -+ self_ns = TemplateNamespace('self:%s' % template.uri, -+ context, template=template, -+ populate_self=False) -+ context._data['self'] = context._data['local'] = self_ns -+ if hasattr(template.module, '_mako_inherit'): -+ ret = template.module._mako_inherit(template, context) -+ if ret: -+ return ret -+ return (template.callable_, context) -+ -+def _render(template, callable_, args, data, as_unicode=False): -+ """create a Context and return the string -+ output of the given template and template callable.""" -+ -+ if as_unicode: -+ buf = util.FastEncodingBuffer(as_unicode=True) -+ elif template.bytestring_passthrough: -+ buf = compat.StringIO() -+ else: -+ buf = util.FastEncodingBuffer( -+ as_unicode=as_unicode, -+ encoding=template.output_encoding, -+ errors=template.encoding_errors) -+ context = Context(buf, **data) -+ context._outputting_as_unicode = as_unicode -+ context._set_with_template(template) -+ -+ _render_context(template, callable_, context, *args, 
-+ **_kwargs_for_callable(callable_, data)) -+ return context._pop_buffer().getvalue() -+ -+def _kwargs_for_callable(callable_, data): -+ argspec = compat.inspect_func_args(callable_) -+ # for normal pages, **pageargs is usually present -+ if argspec[2]: -+ return data -+ -+ # for rendering defs from the top level, figure out the args -+ namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] -+ kwargs = {} -+ for arg in namedargs: -+ if arg != 'context' and arg in data and arg not in kwargs: -+ kwargs[arg] = data[arg] -+ return kwargs -+ -+def _kwargs_for_include(callable_, data, **kwargs): -+ argspec = compat.inspect_func_args(callable_) -+ namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] -+ for arg in namedargs: -+ if arg != 'context' and arg in data and arg not in kwargs: -+ kwargs[arg] = data[arg] -+ return kwargs -+ -+def _render_context(tmpl, callable_, context, *args, **kwargs): -+ import mako.template as template -+ # create polymorphic 'self' namespace for this -+ # template with possibly updated context -+ if not isinstance(tmpl, template.DefTemplate): -+ # if main render method, call from the base of the inheritance stack -+ (inherit, lclcontext) = _populate_self_namespace(context, tmpl) -+ _exec_template(inherit, lclcontext, args=args, kwargs=kwargs) -+ else: -+ # otherwise, call the actual rendering method specified -+ (inherit, lclcontext) = _populate_self_namespace(context, tmpl.parent) -+ _exec_template(callable_, context, args=args, kwargs=kwargs) -+ -+def _exec_template(callable_, context, args=None, kwargs=None): -+ """execute a rendering callable given the callable, a -+ Context, and optional explicit arguments -+ -+ the contextual Template will be located if it exists, and -+ the error handling options specified on that Template will -+ be interpreted here. 
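The error handling in ``_exec_template()`` corresponds to the ``format_exceptions`` and ``error_handler`` options; a brief sketch of both the automatic path and the equivalent done by calling code:

    from mako.template import Template
    from mako import exceptions

    # format_exceptions=True turns a runtime error into a rendered HTML error page
    html = Template("${1 / 0}", format_exceptions=True).render()

    # roughly the same thing done by hand in calling code
    try:
        Template("${1 / 0}").render()
    except Exception:
        html = exceptions.html_error_template().render()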
-+ """ -+ template = context._with_template -+ if template is not None and \ -+ (template.format_exceptions or template.error_handler): -+ try: -+ callable_(context, *args, **kwargs) -+ except Exception: -+ _render_error(template, context, compat.exception_as()) -+ except: -+ e = sys.exc_info()[0] -+ _render_error(template, context, e) -+ else: -+ callable_(context, *args, **kwargs) -+ -+def _render_error(template, context, error): -+ if template.error_handler: -+ result = template.error_handler(context, error) -+ if not result: -+ compat.reraise(*sys.exc_info()) -+ else: -+ error_template = exceptions.html_error_template() -+ if context._outputting_as_unicode: -+ context._buffer_stack[:] = [ -+ util.FastEncodingBuffer(as_unicode=True)] -+ else: -+ context._buffer_stack[:] = [util.FastEncodingBuffer( -+ error_template.output_encoding, -+ error_template.encoding_errors)] -+ -+ context._set_with_template(error_template) -+ error_template.render_context(context, error=error) -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py -new file mode 100644 -index 0000000..fb61062 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py -@@ -0,0 +1,705 @@ -+# mako/template.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+"""Provides the Template class, a facade for parsing, generating and executing -+template strings, as well as template runtime operations.""" -+ -+from mako.lexer import Lexer -+from mako import runtime, util, exceptions, codegen, cache, compat -+import os -+import re -+import shutil -+import stat -+import sys -+import tempfile -+import types -+import weakref -+ -+ -+class Template(object): -+ """Represents a compiled template. -+ -+ :class:`.Template` includes a reference to the original -+ template source (via the :attr:`.source` attribute) -+ as well as the source code of the -+ generated Python module (i.e. the :attr:`.code` attribute), -+ as well as a reference to an actual Python module. -+ -+ :class:`.Template` is constructed using either a literal string -+ representing the template text, or a filename representing a filesystem -+ path to a source file. -+ -+ :param text: textual template source. This argument is mutually -+ exclusive versus the ``filename`` parameter. -+ -+ :param filename: filename of the source template. This argument is -+ mutually exclusive versus the ``text`` parameter. -+ -+ :param buffer_filters: string list of filters to be applied -+ to the output of ``%def``\ s which are buffered, cached, or otherwise -+ filtered, after all filters -+ defined with the ``%def`` itself have been applied. Allows the -+ creation of default expression filters that let the output -+ of return-valued ``%def``\ s "opt out" of that filtering via -+ passing special attributes or objects. -+ -+ :param bytestring_passthrough: When ``True``, and ``output_encoding`` is -+ set to ``None``, and :meth:`.Template.render` is used to render, -+ the `StringIO` or `cStringIO` buffer will be used instead of the -+ default "fast" buffer. This allows raw bytestrings in the -+ output stream, such as in expressions, to pass straight -+ through to the buffer. This flag is forced -+ to ``True`` if ``disable_unicode`` is also configured. -+ -+ .. versionadded:: 0.4 -+ Added to provide the same behavior as that of the previous series. 
-+ -+ :param cache_args: Dictionary of cache configuration arguments that -+ will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`. -+ -+ :param cache_dir: -+ -+ .. deprecated:: 0.6 -+ Use the ``'dir'`` argument in the ``cache_args`` dictionary. -+ See :ref:`caching_toplevel`. -+ -+ :param cache_enabled: Boolean flag which enables caching of this -+ template. See :ref:`caching_toplevel`. -+ -+ :param cache_impl: String name of a :class:`.CacheImpl` caching -+ implementation to use. Defaults to ``'beaker'``. -+ -+ :param cache_type: -+ -+ .. deprecated:: 0.6 -+ Use the ``'type'`` argument in the ``cache_args`` dictionary. -+ See :ref:`caching_toplevel`. -+ -+ :param cache_url: -+ -+ .. deprecated:: 0.6 -+ Use the ``'url'`` argument in the ``cache_args`` dictionary. -+ See :ref:`caching_toplevel`. -+ -+ :param default_filters: List of string filter names that will -+ be applied to all expressions. See :ref:`filtering_default_filters`. -+ -+ :param disable_unicode: Disables all awareness of Python Unicode -+ objects. See :ref:`unicode_disabled`. -+ -+ :param enable_loop: When ``True``, enable the ``loop`` context variable. -+ This can be set to ``False`` to support templates that may -+ be making usage of the name "``loop``". Individual templates can -+ re-enable the "loop" context by placing the directive -+ ``enable_loop="True"`` inside the ``<%page>`` tag -- see -+ :ref:`migrating_loop`. -+ -+ :param encoding_errors: Error parameter passed to ``encode()`` when -+ string encoding is performed. See :ref:`usage_unicode`. -+ -+ :param error_handler: Python callable which is called whenever -+ compile or runtime exceptions occur. The callable is passed -+ the current context as well as the exception. If the -+ callable returns ``True``, the exception is considered to -+ be handled, else it is re-raised after the function -+ completes. Is used to provide custom error-rendering -+ functions. -+ -+ :param format_exceptions: if ``True``, exceptions which occur during -+ the render phase of this template will be caught and -+ formatted into an HTML error page, which then becomes the -+ rendered result of the :meth:`.render` call. Otherwise, -+ runtime exceptions are propagated outwards. -+ -+ :param imports: String list of Python statements, typically individual -+ "import" lines, which will be placed into the module level -+ preamble of all generated Python modules. See the example -+ in :ref:`filtering_default_filters`. -+ -+ :param future_imports: String list of names to import from `__future__`. -+ These will be concatenated into a comma-separated string and inserted -+ into the beginning of the template, e.g. ``futures_imports=['FOO', -+ 'BAR']`` results in ``from __future__ import FOO, BAR``. If you're -+ interested in using features like the new division operator, you must -+ use future_imports to convey that to the renderer, as otherwise the -+ import will not appear as the first executed statement in the generated -+ code and will therefore not have the desired effect. -+ -+ :param input_encoding: Encoding of the template's source code. Can -+ be used in lieu of the coding comment. See -+ :ref:`usage_unicode` as well as :ref:`unicode_toplevel` for -+ details on source encoding. -+ -+ :param lookup: a :class:`.TemplateLookup` instance that will be used -+ for all file lookups via the ``<%namespace>``, -+ ``<%include>``, and ``<%inherit>`` tags. See -+ :ref:`usage_templatelookup`. 
-+ -+ :param module_directory: Filesystem location where generated -+ Python module files will be placed. -+ -+ :param module_filename: Overrides the filename of the generated -+ Python module file. For advanced usage only. -+ -+ :param module_writer: A callable which overrides how the Python -+ module is written entirely. The callable is passed the -+ encoded source content of the module and the destination -+ path to be written to. The default behavior of module writing -+ uses a tempfile in conjunction with a file move in order -+ to make the operation atomic. So a user-defined module -+ writing function that mimics the default behavior would be: -+ -+ .. sourcecode:: python -+ -+ import tempfile -+ import os -+ import shutil -+ -+ def module_writer(source, outputpath): -+ (dest, name) = \\ -+ tempfile.mkstemp( -+ dir=os.path.dirname(outputpath) -+ ) -+ -+ os.write(dest, source) -+ os.close(dest) -+ shutil.move(name, outputpath) -+ -+ from mako.template import Template -+ mytemplate = Template( -+ filename="index.html", -+ module_directory="/path/to/modules", -+ module_writer=module_writer -+ ) -+ -+ The function is provided for unusual configurations where -+ certain platform-specific permissions or other special -+ steps are needed. -+ -+ :param output_encoding: The encoding to use when :meth:`.render` -+ is called. -+ See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`. -+ -+ :param preprocessor: Python callable which will be passed -+ the full template source before it is parsed. The return -+ result of the callable will be used as the template source -+ code. -+ -+ :param lexer_cls: A :class:`.Lexer` class used to parse -+ the template. The :class:`.Lexer` class is used by -+ default. -+ -+ .. versionadded:: 0.7.4 -+ -+ :param strict_undefined: Replaces the automatic usage of -+ ``UNDEFINED`` for any undeclared variables not located in -+ the :class:`.Context` with an immediate raise of -+ ``NameError``. The advantage is immediate reporting of -+ missing variables which include the name. -+ -+ .. versionadded:: 0.3.6 -+ -+ :param uri: string URI or other identifier for this template. -+ If not provided, the ``uri`` is generated from the filesystem -+ path, or from the in-memory identity of a non-file-based -+ template. The primary usage of the ``uri`` is to provide a key -+ within :class:`.TemplateLookup`, as well as to generate the -+ file path of the generated Python module file, if -+ ``module_directory`` is specified. 
-+ -+ """ -+ -+ lexer_cls = Lexer -+ -+ def __init__(self, -+ text=None, -+ filename=None, -+ uri=None, -+ format_exceptions=False, -+ error_handler=None, -+ lookup=None, -+ output_encoding=None, -+ encoding_errors='strict', -+ module_directory=None, -+ cache_args=None, -+ cache_impl='beaker', -+ cache_enabled=True, -+ cache_type=None, -+ cache_dir=None, -+ cache_url=None, -+ module_filename=None, -+ input_encoding=None, -+ disable_unicode=False, -+ module_writer=None, -+ bytestring_passthrough=False, -+ default_filters=None, -+ buffer_filters=(), -+ strict_undefined=False, -+ imports=None, -+ future_imports=None, -+ enable_loop=True, -+ preprocessor=None, -+ lexer_cls=None): -+ if uri: -+ self.module_id = re.sub(r'\W', "_", uri) -+ self.uri = uri -+ elif filename: -+ self.module_id = re.sub(r'\W', "_", filename) -+ drive, path = os.path.splitdrive(filename) -+ path = os.path.normpath(path).replace(os.path.sep, "/") -+ self.uri = path -+ else: -+ self.module_id = "memory:" + hex(id(self)) -+ self.uri = self.module_id -+ -+ u_norm = self.uri -+ if u_norm.startswith("/"): -+ u_norm = u_norm[1:] -+ u_norm = os.path.normpath(u_norm) -+ if u_norm.startswith(".."): -+ raise exceptions.TemplateLookupException( -+ "Template uri \"%s\" is invalid - " -+ "it cannot be relative outside " -+ "of the root path." % self.uri) -+ -+ self.input_encoding = input_encoding -+ self.output_encoding = output_encoding -+ self.encoding_errors = encoding_errors -+ self.disable_unicode = disable_unicode -+ self.bytestring_passthrough = bytestring_passthrough or disable_unicode -+ self.enable_loop = enable_loop -+ self.strict_undefined = strict_undefined -+ self.module_writer = module_writer -+ -+ if compat.py3k and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "Mako for Python 3 does not " -+ "support disabling Unicode") -+ elif output_encoding and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "output_encoding must be set to " -+ "None when disable_unicode is used.") -+ if default_filters is None: -+ if compat.py3k or self.disable_unicode: -+ self.default_filters = ['str'] -+ else: -+ self.default_filters = ['unicode'] -+ else: -+ self.default_filters = default_filters -+ self.buffer_filters = buffer_filters -+ -+ self.imports = imports -+ self.future_imports = future_imports -+ self.preprocessor = preprocessor -+ -+ if lexer_cls is not None: -+ self.lexer_cls = lexer_cls -+ -+ # if plain text, compile code in memory only -+ if text is not None: -+ (code, module) = _compile_text(self, text, filename) -+ self._code = code -+ self._source = text -+ ModuleInfo(module, None, self, filename, code, text) -+ elif filename is not None: -+ # if template filename and a module directory, load -+ # a filesystem-based module file, generating if needed -+ if module_filename is not None: -+ path = module_filename -+ elif module_directory is not None: -+ path = os.path.abspath( -+ os.path.join( -+ os.path.normpath(module_directory), -+ u_norm + ".py" -+ ) -+ ) -+ else: -+ path = None -+ module = self._compile_from_file(path, filename) -+ else: -+ raise exceptions.RuntimeException( -+ "Template requires text or filename") -+ -+ self.module = module -+ self.filename = filename -+ self.callable_ = self.module.render_body -+ self.format_exceptions = format_exceptions -+ self.error_handler = error_handler -+ self.lookup = lookup -+ -+ self.module_directory = module_directory -+ -+ self._setup_cache_args( -+ cache_impl, cache_enabled, cache_args, -+ cache_type, cache_dir, cache_url -+ ) -+ -+ -+ 
@util.memoized_property -+ def reserved_names(self): -+ if self.enable_loop: -+ return codegen.RESERVED_NAMES -+ else: -+ return codegen.RESERVED_NAMES.difference(['loop']) -+ -+ def _setup_cache_args(self, -+ cache_impl, cache_enabled, cache_args, -+ cache_type, cache_dir, cache_url): -+ self.cache_impl = cache_impl -+ self.cache_enabled = cache_enabled -+ if cache_args: -+ self.cache_args = cache_args -+ else: -+ self.cache_args = {} -+ -+ # transfer deprecated cache_* args -+ if cache_type: -+ self.cache_args['type'] = cache_type -+ if cache_dir: -+ self.cache_args['dir'] = cache_dir -+ if cache_url: -+ self.cache_args['url'] = cache_url -+ -+ def _compile_from_file(self, path, filename): -+ if path is not None: -+ util.verify_directory(os.path.dirname(path)) -+ filemtime = os.stat(filename)[stat.ST_MTIME] -+ if not os.path.exists(path) or \ -+ os.stat(path)[stat.ST_MTIME] < filemtime: -+ data = util.read_file(filename) -+ _compile_module_file( -+ self, -+ data, -+ filename, -+ path, -+ self.module_writer) -+ module = compat.load_module(self.module_id, path) -+ del sys.modules[self.module_id] -+ if module._magic_number != codegen.MAGIC_NUMBER: -+ data = util.read_file(filename) -+ _compile_module_file( -+ self, -+ data, -+ filename, -+ path, -+ self.module_writer) -+ module = compat.load_module(self.module_id, path) -+ del sys.modules[self.module_id] -+ ModuleInfo(module, path, self, filename, None, None) -+ else: -+ # template filename and no module directory, compile code -+ # in memory -+ data = util.read_file(filename) -+ code, module = _compile_text( -+ self, -+ data, -+ filename) -+ self._source = None -+ self._code = code -+ ModuleInfo(module, None, self, filename, code, None) -+ return module -+ -+ @property -+ def source(self): -+ """Return the template source code for this :class:`.Template`.""" -+ -+ return _get_module_info_from_callable(self.callable_).source -+ -+ @property -+ def code(self): -+ """Return the module source code for this :class:`.Template`.""" -+ -+ return _get_module_info_from_callable(self.callable_).code -+ -+ @util.memoized_property -+ def cache(self): -+ return cache.Cache(self) -+ -+ @property -+ def cache_dir(self): -+ return self.cache_args['dir'] -+ @property -+ def cache_url(self): -+ return self.cache_args['url'] -+ @property -+ def cache_type(self): -+ return self.cache_args['type'] -+ -+ def render(self, *args, **data): -+ """Render the output of this template as a string. -+ -+ If the template specifies an output encoding, the string -+ will be encoded accordingly, else the output is raw (raw -+ output uses `cStringIO` and can't handle multibyte -+ characters). A :class:`.Context` object is created corresponding -+ to the given data. Arguments that are explicitly declared -+ by this template's internal rendering method are also -+ pulled from the given ``*args``, ``**data`` members. -+ -+ """ -+ return runtime._render(self, self.callable_, args, data) -+ -+ def render_unicode(self, *args, **data): -+ """Render the output of this template as a unicode object.""" -+ -+ return runtime._render(self, -+ self.callable_, -+ args, -+ data, -+ as_unicode=True) -+ -+ def render_context(self, context, *args, **kwargs): -+ """Render this :class:`.Template` with the given context. -+ -+ The data is written to the context's buffer. 
-+ -+ """ -+ if getattr(context, '_with_template', None) is None: -+ context._set_with_template(self) -+ runtime._render_context(self, -+ self.callable_, -+ context, -+ *args, -+ **kwargs) -+ -+ def has_def(self, name): -+ return hasattr(self.module, "render_%s" % name) -+ -+ def get_def(self, name): -+ """Return a def of this template as a :class:`.DefTemplate`.""" -+ -+ return DefTemplate(self, getattr(self.module, "render_%s" % name)) -+ -+ def _get_def_callable(self, name): -+ return getattr(self.module, "render_%s" % name) -+ -+ @property -+ def last_modified(self): -+ return self.module._modified_time -+ -+class ModuleTemplate(Template): -+ """A Template which is constructed given an existing Python module. -+ -+ e.g.:: -+ -+ t = Template("this is a template") -+ f = file("mymodule.py", "w") -+ f.write(t.code) -+ f.close() -+ -+ import mymodule -+ -+ t = ModuleTemplate(mymodule) -+ print t.render() -+ -+ """ -+ -+ def __init__(self, module, -+ module_filename=None, -+ template=None, -+ template_filename=None, -+ module_source=None, -+ template_source=None, -+ output_encoding=None, -+ encoding_errors='strict', -+ disable_unicode=False, -+ bytestring_passthrough=False, -+ format_exceptions=False, -+ error_handler=None, -+ lookup=None, -+ cache_args=None, -+ cache_impl='beaker', -+ cache_enabled=True, -+ cache_type=None, -+ cache_dir=None, -+ cache_url=None, -+ ): -+ self.module_id = re.sub(r'\W', "_", module._template_uri) -+ self.uri = module._template_uri -+ self.input_encoding = module._source_encoding -+ self.output_encoding = output_encoding -+ self.encoding_errors = encoding_errors -+ self.disable_unicode = disable_unicode -+ self.bytestring_passthrough = bytestring_passthrough or disable_unicode -+ self.enable_loop = module._enable_loop -+ -+ if compat.py3k and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "Mako for Python 3 does not " -+ "support disabling Unicode") -+ elif output_encoding and disable_unicode: -+ raise exceptions.UnsupportedError( -+ "output_encoding must be set to " -+ "None when disable_unicode is used.") -+ -+ self.module = module -+ self.filename = template_filename -+ ModuleInfo(module, -+ module_filename, -+ self, -+ template_filename, -+ module_source, -+ template_source) -+ -+ self.callable_ = self.module.render_body -+ self.format_exceptions = format_exceptions -+ self.error_handler = error_handler -+ self.lookup = lookup -+ self._setup_cache_args( -+ cache_impl, cache_enabled, cache_args, -+ cache_type, cache_dir, cache_url -+ ) -+ -+class DefTemplate(Template): -+ """A :class:`.Template` which represents a callable def in a parent -+ template.""" -+ -+ def __init__(self, parent, callable_): -+ self.parent = parent -+ self.callable_ = callable_ -+ self.output_encoding = parent.output_encoding -+ self.module = parent.module -+ self.encoding_errors = parent.encoding_errors -+ self.format_exceptions = parent.format_exceptions -+ self.error_handler = parent.error_handler -+ self.enable_loop = parent.enable_loop -+ self.lookup = parent.lookup -+ self.bytestring_passthrough = parent.bytestring_passthrough -+ -+ def get_def(self, name): -+ return self.parent.get_def(name) -+ -+class ModuleInfo(object): -+ """Stores information about a module currently loaded into -+ memory, provides reverse lookups of template source, module -+ source code based on a module's identifier. 
-+ -+ """ -+ _modules = weakref.WeakValueDictionary() -+ -+ def __init__(self, -+ module, -+ module_filename, -+ template, -+ template_filename, -+ module_source, -+ template_source): -+ self.module = module -+ self.module_filename = module_filename -+ self.template_filename = template_filename -+ self.module_source = module_source -+ self.template_source = template_source -+ self._modules[module.__name__] = template._mmarker = self -+ if module_filename: -+ self._modules[module_filename] = self -+ -+ @classmethod -+ def get_module_source_metadata(cls, module_source, full_line_map=False): -+ source_map = re.search( -+ r"__M_BEGIN_METADATA(.+?)__M_END_METADATA", -+ module_source, re.S).group(1) -+ source_map = compat.json.loads(source_map) -+ source_map['line_map'] = dict((int(k), int(v)) -+ for k, v in source_map['line_map'].items()) -+ if full_line_map: -+ f_line_map = source_map['full_line_map'] = [] -+ line_map = source_map['line_map'] -+ -+ curr_templ_line = 1 -+ for mod_line in range(1, max(line_map)): -+ if mod_line in line_map: -+ curr_templ_line = line_map[mod_line] -+ f_line_map.append(curr_templ_line) -+ return source_map -+ -+ @property -+ def code(self): -+ if self.module_source is not None: -+ return self.module_source -+ else: -+ return util.read_python_file(self.module_filename) -+ -+ @property -+ def source(self): -+ if self.template_source is not None: -+ if self.module._source_encoding and \ -+ not isinstance(self.template_source, compat.text_type): -+ return self.template_source.decode( -+ self.module._source_encoding) -+ else: -+ return self.template_source -+ else: -+ data = util.read_file(self.template_filename) -+ if self.module._source_encoding: -+ return data.decode(self.module._source_encoding) -+ else: -+ return data -+ -+def _compile(template, text, filename, generate_magic_comment): -+ lexer = template.lexer_cls(text, -+ filename, -+ disable_unicode=template.disable_unicode, -+ input_encoding=template.input_encoding, -+ preprocessor=template.preprocessor) -+ node = lexer.parse() -+ source = codegen.compile(node, -+ template.uri, -+ filename, -+ default_filters=template.default_filters, -+ buffer_filters=template.buffer_filters, -+ imports=template.imports, -+ future_imports=template.future_imports, -+ source_encoding=lexer.encoding, -+ generate_magic_comment=generate_magic_comment, -+ disable_unicode=template.disable_unicode, -+ strict_undefined=template.strict_undefined, -+ enable_loop=template.enable_loop, -+ reserved_names=template.reserved_names) -+ return source, lexer -+ -+def _compile_text(template, text, filename): -+ identifier = template.module_id -+ source, lexer = _compile(template, text, filename, -+ generate_magic_comment=template.disable_unicode) -+ -+ cid = identifier -+ if not compat.py3k and isinstance(cid, compat.text_type): -+ cid = cid.encode() -+ module = types.ModuleType(cid) -+ code = compile(source, cid, 'exec') -+ -+ # this exec() works for 2.4->3.3. -+ exec(code, module.__dict__, module.__dict__) -+ return (source, module) -+ -+def _compile_module_file(template, text, filename, outputpath, module_writer): -+ source, lexer = _compile(template, text, filename, -+ generate_magic_comment=True) -+ -+ if isinstance(source, compat.text_type): -+ source = source.encode(lexer.encoding or 'ascii') -+ -+ if module_writer: -+ module_writer(source, outputpath) -+ else: -+ # make tempfiles in the same location as the ultimate -+ # location. this ensures they're on the same filesystem, -+ # avoiding synchronization issues. 
-+ (dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath)) -+ -+ os.write(dest, source) -+ os.close(dest) -+ shutil.move(name, outputpath) -+ -+def _get_module_info_from_callable(callable_): -+ if compat.py3k: -+ return _get_module_info(callable_.__globals__['__name__']) -+ else: -+ return _get_module_info(callable_.func_globals['__name__']) -+ -+def _get_module_info(filename): -+ return ModuleInfo._modules[filename] -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py -new file mode 100644 -index 0000000..cba2ab7 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py -@@ -0,0 +1,360 @@ -+# mako/util.py -+# Copyright (C) 2006-2015 the Mako authors and contributors -+# -+# This module is part of Mako and is released under -+# the MIT License: http://www.opensource.org/licenses/mit-license.php -+ -+import re -+import collections -+import codecs -+import os -+from mako import compat -+import operator -+ -+def update_wrapper(decorated, fn): -+ decorated.__wrapped__ = fn -+ decorated.__name__ = fn.__name__ -+ return decorated -+ -+ -+class PluginLoader(object): -+ def __init__(self, group): -+ self.group = group -+ self.impls = {} -+ -+ def load(self, name): -+ if name in self.impls: -+ return self.impls[name]() -+ else: -+ import pkg_resources -+ for impl in pkg_resources.iter_entry_points( -+ self.group, -+ name): -+ self.impls[name] = impl.load -+ return impl.load() -+ else: -+ from mako import exceptions -+ raise exceptions.RuntimeException( -+ "Can't load plugin %s %s" % -+ (self.group, name)) -+ -+ def register(self, name, modulepath, objname): -+ def load(): -+ mod = __import__(modulepath) -+ for token in modulepath.split(".")[1:]: -+ mod = getattr(mod, token) -+ return getattr(mod, objname) -+ self.impls[name] = load -+ -+def verify_directory(dir): -+ """create and/or verify a filesystem directory.""" -+ -+ tries = 0 -+ -+ while not os.path.exists(dir): -+ try: -+ tries += 1 -+ os.makedirs(dir, compat.octal("0775")) -+ except: -+ if tries > 5: -+ raise -+ -+def to_list(x, default=None): -+ if x is None: -+ return default -+ if not isinstance(x, (list, tuple)): -+ return [x] -+ else: -+ return x -+ -+ -+class memoized_property(object): -+ """A read-only @property that is only evaluated once.""" -+ def __init__(self, fget, doc=None): -+ self.fget = fget -+ self.__doc__ = doc or fget.__doc__ -+ self.__name__ = fget.__name__ -+ -+ def __get__(self, obj, cls): -+ if obj is None: -+ return self -+ obj.__dict__[self.__name__] = result = self.fget(obj) -+ return result -+ -+class memoized_instancemethod(object): -+ """Decorate a method memoize its return value. -+ -+ Best applied to no-arg methods: memoization is not sensitive to -+ argument values, and will always return the same value even when -+ called with different arguments. 
-+ -+ """ -+ def __init__(self, fget, doc=None): -+ self.fget = fget -+ self.__doc__ = doc or fget.__doc__ -+ self.__name__ = fget.__name__ -+ -+ def __get__(self, obj, cls): -+ if obj is None: -+ return self -+ def oneshot(*args, **kw): -+ result = self.fget(obj, *args, **kw) -+ memo = lambda *a, **kw: result -+ memo.__name__ = self.__name__ -+ memo.__doc__ = self.__doc__ -+ obj.__dict__[self.__name__] = memo -+ return result -+ oneshot.__name__ = self.__name__ -+ oneshot.__doc__ = self.__doc__ -+ return oneshot -+ -+class SetLikeDict(dict): -+ """a dictionary that has some setlike methods on it""" -+ def union(self, other): -+ """produce a 'union' of this dict and another (at the key level). -+ -+ values in the second dict take precedence over that of the first""" -+ x = SetLikeDict(**self) -+ x.update(other) -+ return x -+ -+class FastEncodingBuffer(object): -+ """a very rudimentary buffer that is faster than StringIO, -+ but doesn't crash on unicode data like cStringIO.""" -+ -+ def __init__(self, encoding=None, errors='strict', as_unicode=False): -+ self.data = collections.deque() -+ self.encoding = encoding -+ if as_unicode: -+ self.delim = compat.u('') -+ else: -+ self.delim = '' -+ self.as_unicode = as_unicode -+ self.errors = errors -+ self.write = self.data.append -+ -+ def truncate(self): -+ self.data = collections.deque() -+ self.write = self.data.append -+ -+ def getvalue(self): -+ if self.encoding: -+ return self.delim.join(self.data).encode(self.encoding, -+ self.errors) -+ else: -+ return self.delim.join(self.data) -+ -+class LRUCache(dict): -+ """A dictionary-like object that stores a limited number of items, -+ discarding lesser used items periodically. -+ -+ this is a rewrite of LRUCache from Myghty to use a periodic timestamp-based -+ paradigm so that synchronization is not really needed. the size management -+ is inexact. -+ """ -+ -+ class _Item(object): -+ def __init__(self, key, value): -+ self.key = key -+ self.value = value -+ self.timestamp = compat.time_func() -+ def __repr__(self): -+ return repr(self.value) -+ -+ def __init__(self, capacity, threshold=.5): -+ self.capacity = capacity -+ self.threshold = threshold -+ -+ def __getitem__(self, key): -+ item = dict.__getitem__(self, key) -+ item.timestamp = compat.time_func() -+ return item.value -+ -+ def values(self): -+ return [i.value for i in dict.values(self)] -+ -+ def setdefault(self, key, value): -+ if key in self: -+ return self[key] -+ else: -+ self[key] = value -+ return value -+ -+ def __setitem__(self, key, value): -+ item = dict.get(self, key) -+ if item is None: -+ item = self._Item(key, value) -+ dict.__setitem__(self, key, item) -+ else: -+ item.value = value -+ self._manage_size() -+ -+ def _manage_size(self): -+ while len(self) > self.capacity + self.capacity * self.threshold: -+ bytime = sorted(dict.values(self), -+ key=operator.attrgetter('timestamp'), reverse=True) -+ for item in bytime[self.capacity:]: -+ try: -+ del self[item.key] -+ except KeyError: -+ # if we couldn't find a key, most likely some other thread -+ # broke in on us. loop around and try again -+ break -+ -+# Regexp to match python magic encoding line -+_PYTHON_MAGIC_COMMENT_re = re.compile( -+ r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', -+ re.VERBOSE) -+ -+def parse_encoding(fp): -+ """Deduce the encoding of a Python source file (binary mode) from magic -+ comment. -+ -+ It does this in the same way as the `Python interpreter`__ -+ -+ .. 
__: http://docs.python.org/ref/encodings.html -+ -+ The ``fp`` argument should be a seekable file object in binary mode. -+ """ -+ pos = fp.tell() -+ fp.seek(0) -+ try: -+ line1 = fp.readline() -+ has_bom = line1.startswith(codecs.BOM_UTF8) -+ if has_bom: -+ line1 = line1[len(codecs.BOM_UTF8):] -+ -+ m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore')) -+ if not m: -+ try: -+ import parser -+ parser.suite(line1.decode('ascii', 'ignore')) -+ except (ImportError, SyntaxError): -+ # Either it's a real syntax error, in which case the source -+ # is not valid python source, or line2 is a continuation of -+ # line1, in which case we don't want to scan line2 for a magic -+ # comment. -+ pass -+ else: -+ line2 = fp.readline() -+ m = _PYTHON_MAGIC_COMMENT_re.match( -+ line2.decode('ascii', 'ignore')) -+ -+ if has_bom: -+ if m: -+ raise SyntaxError("python refuses to compile code with both a UTF8" \ -+ " byte-order-mark and a magic encoding comment") -+ return 'utf_8' -+ elif m: -+ return m.group(1) -+ else: -+ return None -+ finally: -+ fp.seek(pos) -+ -+def sorted_dict_repr(d): -+ """repr() a dictionary with the keys in order. -+ -+ Used by the lexer unit test to compare parse trees based on strings. -+ -+ """ -+ keys = list(d.keys()) -+ keys.sort() -+ return "{" + ", ".join(["%r: %r" % (k, d[k]) for k in keys]) + "}" -+ -+def restore__ast(_ast): -+ """Attempt to restore the required classes to the _ast module if it -+ appears to be missing them -+ """ -+ if hasattr(_ast, 'AST'): -+ return -+ _ast.PyCF_ONLY_AST = 2 << 9 -+ m = compile("""\ -+def foo(): pass -+class Bar(object): pass -+if False: pass -+baz = 'mako' -+1 + 2 - 3 * 4 / 5 -+6 // 7 % 8 << 9 >> 10 -+11 & 12 ^ 13 | 14 -+15 and 16 or 17 -+-baz + (not +18) - ~17 -+baz and 'foo' or 'bar' -+(mako is baz == baz) is not baz != mako -+mako > baz < mako >= baz <= mako -+mako in baz not in mako""", '', 'exec', _ast.PyCF_ONLY_AST) -+ _ast.Module = type(m) -+ -+ for cls in _ast.Module.__mro__: -+ if cls.__name__ == 'mod': -+ _ast.mod = cls -+ elif cls.__name__ == 'AST': -+ _ast.AST = cls -+ -+ _ast.FunctionDef = type(m.body[0]) -+ _ast.ClassDef = type(m.body[1]) -+ _ast.If = type(m.body[2]) -+ -+ _ast.Name = type(m.body[3].targets[0]) -+ _ast.Store = type(m.body[3].targets[0].ctx) -+ _ast.Str = type(m.body[3].value) -+ -+ _ast.Sub = type(m.body[4].value.op) -+ _ast.Add = type(m.body[4].value.left.op) -+ _ast.Div = type(m.body[4].value.right.op) -+ _ast.Mult = type(m.body[4].value.right.left.op) -+ -+ _ast.RShift = type(m.body[5].value.op) -+ _ast.LShift = type(m.body[5].value.left.op) -+ _ast.Mod = type(m.body[5].value.left.left.op) -+ _ast.FloorDiv = type(m.body[5].value.left.left.left.op) -+ -+ _ast.BitOr = type(m.body[6].value.op) -+ _ast.BitXor = type(m.body[6].value.left.op) -+ _ast.BitAnd = type(m.body[6].value.left.left.op) -+ -+ _ast.Or = type(m.body[7].value.op) -+ _ast.And = type(m.body[7].value.values[0].op) -+ -+ _ast.Invert = type(m.body[8].value.right.op) -+ _ast.Not = type(m.body[8].value.left.right.op) -+ _ast.UAdd = type(m.body[8].value.left.right.operand.op) -+ _ast.USub = type(m.body[8].value.left.left.op) -+ -+ _ast.Or = type(m.body[9].value.op) -+ _ast.And = type(m.body[9].value.values[0].op) -+ -+ _ast.IsNot = type(m.body[10].value.ops[0]) -+ _ast.NotEq = type(m.body[10].value.ops[1]) -+ _ast.Is = type(m.body[10].value.left.ops[0]) -+ _ast.Eq = type(m.body[10].value.left.ops[1]) -+ -+ _ast.Gt = type(m.body[11].value.ops[0]) -+ _ast.Lt = type(m.body[11].value.ops[1]) -+ _ast.GtE = type(m.body[11].value.ops[2]) -+ 
_ast.LtE = type(m.body[11].value.ops[3]) -+ -+ _ast.In = type(m.body[12].value.ops[0]) -+ _ast.NotIn = type(m.body[12].value.ops[1]) -+ -+ -+ -+def read_file(path, mode='rb'): -+ fp = open(path, mode) -+ try: -+ data = fp.read() -+ return data -+ finally: -+ fp.close() -+ -+def read_python_file(path): -+ fp = open(path, "rb") -+ try: -+ encoding = parse_encoding(fp) -+ data = fp.read() -+ if encoding: -+ data = data.decode(encoding) -+ return data -+ finally: -+ fp.close() -+ -diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template -new file mode 100644 -index 0000000..5fbba17 ---- /dev/null -+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template -@@ -0,0 +1,106 @@ -+/****************************************************************************** -+* -+* Copyright 2015 -+* Intel Corporation -+* -+* Licensed under the Apache License, Version 2.0 (the "License"); -+* you may not use this file except in compliance with the License. -+* You may obtain a copy of the License at -+* -+* http ://www.apache.org/licenses/LICENSE-2.0 -+* -+* Unless required by applicable law or agreed to in writing, software -+* distributed under the License is distributed on an "AS IS" BASIS, -+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+* See the License for the specific language governing permissions and -+* limitations under the License. -+* -+% if gen_header: -+* @file ${filename}.h -+% else: -+* @file ${filename}.cpp -+% endif -+* -+* @brief Dynamic Knobs for Core. -+* -+* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== -+* -+******************************************************************************/ -+%if gen_header: -+#pragma once -+ -+template -+struct Knob -+{ -+ const T& Value() const { return m_Value; } -+ const T& Value(const T& newValue) { m_Value = newValue; return Value(); } -+ -+private: -+ T m_Value; -+}; -+ -+#define DEFINE_KNOB(_name, _type, _default) \\ -+ -+ struct Knob_##_name : Knob<_type> \\ -+ -+ { Knob_##_name() { Value(_default); } \\ -+ -+ const char* Name() const { return "KNOB_" #_name; } \\ -+ -+ } _name; -+ -+#define GET_KNOB(_name) g_GlobalKnobs._name.Value() -+#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue) -+ -+struct GlobalKnobs -+{ -+ % for knob in knobs: -+ //----------------------------------------------------------- -+ // KNOB_${knob[0]} -+ // -+ % for line in knob[1]['desc']: -+ // ${line} -+ % endfor -+ DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']}); -+ -+ % endfor -+ GlobalKnobs(); -+}; -+extern GlobalKnobs g_GlobalKnobs; -+ -+<% -+ max_len = 0 -+ for knob in knobs: -+ if len(knob[0]) > max_len: max_len = len(knob[0]) -+ max_len += len('KNOB_ ') -+ if max_len % 4: max_len += 4 - (max_len % 4) -+ -+ def space_knob(knob): -+ knob_len = len('KNOB_' + knob) -+ return ' '*(max_len - knob_len) -+%> -+% for knob in knobs: -+#define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]}) -+% endfor -+ -+% else: -+% for inc in includes: -+#include <${inc}> -+% endfor -+ -+//======================================================== -+// Static Data Members -+//======================================================== -+GlobalKnobs g_GlobalKnobs; -+ -+//======================================================== -+// Knob Initialization -+//======================================================== -+GlobalKnobs::GlobalKnobs() -+{ -+ % for knob in knobs: -+ 
InitKnob(${knob[0]}); -+ % endfor -+} -+ -+% endif --- -2.6.2 - diff --git a/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch b/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch deleted file mode 100644 index 239130f..0000000 --- a/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch +++ /dev/null @@ -1,42 +0,0 @@ -From fe9e5f557953d3c4b9c3cac6be0ff29d97c3f2c7 Mon Sep 17 00:00:00 2001 -From: Igor Gnatenko -Date: Thu, 22 Oct 2015 17:08:04 +0200 -Subject: [PATCH 3/3] gallium/swr: add flags parameter to - pipe_screen::context_create - -Signed-off-by: Igor Gnatenko ---- - src/gallium/drivers/swr/swr_context.cpp | 3 ++- - src/gallium/drivers/swr/swr_context.h | 2 +- - 2 files changed, 3 insertions(+), 2 deletions(-) - -diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp -index 6269cd0..2dd3443 100644 ---- a/src/gallium/drivers/swr/swr_context.cpp -+++ b/src/gallium/drivers/swr/swr_context.cpp -@@ -336,7 +336,8 @@ swr_render_condition(struct pipe_context *pipe, - - - struct pipe_context * --swr_create_context(struct pipe_screen *screen, void *priv) -+swr_create_context(struct pipe_screen *screen, void *priv, -+ unsigned flags) - { - struct swr_context *ctx = CALLOC_STRUCT(swr_context); - ctx->blendJIT = -diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h -index 9d93a6d..5271eac 100644 ---- a/src/gallium/drivers/swr/swr_context.h -+++ b/src/gallium/drivers/swr/swr_context.h -@@ -160,7 +160,7 @@ swr_context(struct pipe_context *pipe) - return (struct swr_context *)pipe; - } - --struct pipe_context *swr_create_context(struct pipe_screen *, void *priv); -+struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags); - - void swr_state_init(struct pipe_context *pipe); - --- -2.6.2 - diff --git a/mesa.spec b/mesa.spec index 6ca184d..9c81d93 100644 --- a/mesa.spec +++ b/mesa.spec @@ -17,7 +17,6 @@ %define min_wayland_version 1.0 %if 0%{?with_llvm} %define with_radeonsi 1 -%define with_swr 1 %endif %ifarch s390 s390x ppc @@ -75,10 +74,6 @@ Patch15: mesa-9.2-hardware-float.patch Patch20: mesa-10.2-evergreen-big-endian.patch Patch30: mesa-10.3-bigendian-assert.patch -Patch101: 0001-Initial-public-Mesa-SWR.patch -Patch102: 0002-swr-484541-Initial-public-SWR.patch -Patch103: 0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch - # To have sha info in glxinfo BuildRequires: git-core @@ -353,10 +348,6 @@ grep -q ^/ src/gallium/auxiliary/vl/vl_decoder.c && exit 1 %patch20 -p1 -b .egbe %patch30 -p1 -b .beassert -%patch101 -p1 -%patch102 -p1 -%patch103 -p1 - %if 0%{with_private_llvm} sed -i 's/llvm-config/mesa-private-llvm-config-%{__isa_bits}/g' configure.ac sed -i 's/`$LLVM_CONFIG --version`/&-mesa/' configure.ac @@ -404,8 +395,7 @@ export CXXFLAGS="$RPM_OPT_FLAGS %{?with_opencl:-frtti -fexceptions} %{!?with_ope %if %{with_hardware} %{?with_xa:--enable-xa} \ %{?with_nine:--enable-nine} \ - --with-gallium-drivers=%{?with_vmware:svga,}%{?with_radeonsi:radeonsi,}%{?with_llvm:swrast,r600,}%{?with_freedreno:freedreno,}%{?with_vc4:vc4,}%{?with_ilo:ilo,}%{?with_swr:swr,}r300,nouveau \ - %{?with_swr:--enable-swr-native} \ + --with-gallium-drivers=%{?with_vmware:svga,}%{?with_radeonsi:radeonsi,}%{?with_llvm:swrast,r600,}%{?with_freedreno:freedreno,}%{?with_vc4:vc4,}%{?with_ilo:ilo,}r300,nouveau \ %else --with-gallium-drivers=%{?with_llvm:swrast} \ %endif @@ -687,6 +677,7 @@ rm -rf $RPM_BUILD_ROOT %changelog * Thu Oct 22 2015 Igor Gnatenko - 
11.1.0-0.devel.10.7182498 - 7182498 +- Disable SWR rasterizer * Wed Oct 21 2015 Igor Gnatenko - 11.1.0-0.devel.9.4a168ad - Enable experimental SWR rasterizer