diff --git a/drivers/gpu/msm/Makefile b/drivers/gpu/msm/Makefile
index fec53633bbebb05ac7d23a4fa37f2ff5eadf23d4..1a6b072a70772ac13c7d2d2d7635e49e0fd0cbad 100644
--- a/drivers/gpu/msm/Makefile
+++ b/drivers/gpu/msm/Makefile
@@ -23,8 +23,10 @@ msm_kgsl_core-$(CONFIG_SYNC) += kgsl_sync.o
 msm_adreno-y += \
 	adreno_ringbuffer.o \
 	adreno_drawctxt.o \
+	adreno_dispatch.o \
 	adreno_postmortem.o \
 	adreno_snapshot.o \
+	adreno_trace.o \
 	adreno_a2xx.o \
 	adreno_a2xx_trace.o \
 	adreno_a2xx_snapshot.o \
diff --git a/drivers/gpu/msm/a3xx_reg.h b/drivers/gpu/msm/a3xx_reg.h
index be9f3acebcc5726f620248a8e7772fadffe108e3..21d475954fb22447b900785b3801797c169470e5 100644
--- a/drivers/gpu/msm/a3xx_reg.h
+++ b/drivers/gpu/msm/a3xx_reg.h
@@ -66,15 +66,103 @@
 #define A3XX_RBBM_INT_0_MASK 0x063
 #define A3XX_RBBM_INT_0_STATUS 0x064
 #define A3XX_RBBM_PERFCTR_CTL 0x80
+#define A3XX_RBBM_PERFCTR_LOAD_CMD0 0x81
+#define A3XX_RBBM_PERFCTR_LOAD_CMD1 0x82
+#define A3XX_RBBM_PERFCTR_LOAD_VALUE_LO 0x84
+#define A3XX_RBBM_PERFCTR_LOAD_VALUE_HI 0x85
+#define A3XX_RBBM_PERFCOUNTER0_SELECT 0x86
+#define A3XX_RBBM_PERFCOUNTER1_SELECT 0x87
 #define A3XX_RBBM_GPU_BUSY_MASKED 0x88
+#define A3XX_RBBM_PERFCTR_CP_0_LO 0x90
+#define A3XX_RBBM_PERFCTR_CP_0_HI 0x91
+#define A3XX_RBBM_PERFCTR_RBBM_0_LO 0x92
+#define A3XX_RBBM_PERFCTR_RBBM_0_HI 0x93
+#define A3XX_RBBM_PERFCTR_RBBM_1_LO 0x94
+#define A3XX_RBBM_PERFCTR_RBBM_1_HI 0x95
+#define A3XX_RBBM_PERFCTR_PC_0_LO 0x96
+#define A3XX_RBBM_PERFCTR_PC_0_HI 0x97
+#define A3XX_RBBM_PERFCTR_PC_1_LO 0x98
+#define A3XX_RBBM_PERFCTR_PC_1_HI 0x99
+#define A3XX_RBBM_PERFCTR_PC_2_LO 0x9A
+#define A3XX_RBBM_PERFCTR_PC_2_HI 0x9B
+#define A3XX_RBBM_PERFCTR_PC_3_LO 0x9C
+#define A3XX_RBBM_PERFCTR_PC_3_HI 0x9D
+#define A3XX_RBBM_PERFCTR_VFD_0_LO 0x9E
+#define A3XX_RBBM_PERFCTR_VFD_0_HI 0x9F
+#define A3XX_RBBM_PERFCTR_VFD_1_LO 0xA0
+#define A3XX_RBBM_PERFCTR_VFD_1_HI 0xA1
+#define A3XX_RBBM_PERFCTR_HLSQ_0_LO 0xA2
+#define A3XX_RBBM_PERFCTR_HLSQ_0_HI 0xA3
+#define A3XX_RBBM_PERFCTR_HLSQ_1_LO 0xA4
+#define A3XX_RBBM_PERFCTR_HLSQ_1_HI 0xA5
+#define A3XX_RBBM_PERFCTR_HLSQ_2_LO 0xA6
+#define A3XX_RBBM_PERFCTR_HLSQ_2_HI 0xA7
+#define A3XX_RBBM_PERFCTR_HLSQ_3_LO 0xA8
+#define A3XX_RBBM_PERFCTR_HLSQ_3_HI 0xA9
+#define A3XX_RBBM_PERFCTR_HLSQ_4_LO 0xAA
+#define A3XX_RBBM_PERFCTR_HLSQ_4_HI 0xAB
+#define A3XX_RBBM_PERFCTR_HLSQ_5_LO 0xAC
+#define A3XX_RBBM_PERFCTR_HLSQ_5_HI 0xAD
+#define A3XX_RBBM_PERFCTR_VPC_0_LO 0xAE
+#define A3XX_RBBM_PERFCTR_VPC_0_HI 0xAF
+#define A3XX_RBBM_PERFCTR_VPC_1_LO 0xB0
+#define A3XX_RBBM_PERFCTR_VPC_1_HI 0xB1
+#define A3XX_RBBM_PERFCTR_TSE_0_LO 0xB2
+#define A3XX_RBBM_PERFCTR_TSE_0_HI 0xB3
+#define A3XX_RBBM_PERFCTR_TSE_1_LO 0xB4
+#define A3XX_RBBM_PERFCTR_TSE_1_HI 0xB5
+#define A3XX_RBBM_PERFCTR_RAS_0_LO 0xB6
+#define A3XX_RBBM_PERFCTR_RAS_0_HI 0xB7
+#define A3XX_RBBM_PERFCTR_RAS_1_LO 0xB8
+#define A3XX_RBBM_PERFCTR_RAS_1_HI 0xB9
+#define A3XX_RBBM_PERFCTR_UCHE_0_LO 0xBA
+#define A3XX_RBBM_PERFCTR_UCHE_0_HI 0xBB
+#define A3XX_RBBM_PERFCTR_UCHE_1_LO 0xBC
+#define A3XX_RBBM_PERFCTR_UCHE_1_HI 0xBD
+#define A3XX_RBBM_PERFCTR_UCHE_2_LO 0xBE
+#define A3XX_RBBM_PERFCTR_UCHE_2_HI 0xBF
+#define A3XX_RBBM_PERFCTR_UCHE_3_LO 0xC0
+#define A3XX_RBBM_PERFCTR_UCHE_3_HI 0xC1
+#define A3XX_RBBM_PERFCTR_UCHE_4_LO 0xC2
+#define A3XX_RBBM_PERFCTR_UCHE_4_HI 0xC3
+#define A3XX_RBBM_PERFCTR_UCHE_5_LO 0xC4
+#define A3XX_RBBM_PERFCTR_UCHE_5_HI 0xC5
+#define A3XX_RBBM_PERFCTR_TP_0_LO 0xC6
+#define A3XX_RBBM_PERFCTR_TP_0_HI 0xC7
+#define A3XX_RBBM_PERFCTR_TP_1_LO 0xC8
+#define A3XX_RBBM_PERFCTR_TP_1_HI 0xC9
+#define A3XX_RBBM_PERFCTR_TP_2_LO 0xCA
+#define A3XX_RBBM_PERFCTR_TP_2_HI 0xCB
+#define A3XX_RBBM_PERFCTR_TP_3_LO 0xCC
+#define A3XX_RBBM_PERFCTR_TP_3_HI 0xCD
+#define A3XX_RBBM_PERFCTR_TP_4_LO 0xCE
+#define A3XX_RBBM_PERFCTR_TP_4_HI 0xCF
+#define A3XX_RBBM_PERFCTR_TP_5_LO 0xD0
+#define A3XX_RBBM_PERFCTR_TP_5_HI 0xD1
+#define A3XX_RBBM_PERFCTR_SP_0_LO 0xD2
+#define A3XX_RBBM_PERFCTR_SP_0_HI 0xD3
+#define A3XX_RBBM_PERFCTR_SP_1_LO 0xD4
+#define A3XX_RBBM_PERFCTR_SP_1_HI 0xD5
+#define A3XX_RBBM_PERFCTR_SP_2_LO 0xD6
+#define A3XX_RBBM_PERFCTR_SP_2_HI 0xD7
+#define A3XX_RBBM_PERFCTR_SP_3_LO 0xD8
+#define A3XX_RBBM_PERFCTR_SP_3_HI 0xD9
+#define A3XX_RBBM_PERFCTR_SP_4_LO 0xDA
+#define A3XX_RBBM_PERFCTR_SP_4_HI 0xDB
 #define A3XX_RBBM_PERFCTR_SP_5_LO 0xDC
 #define A3XX_RBBM_PERFCTR_SP_5_HI 0xDD
 #define A3XX_RBBM_PERFCTR_SP_6_LO 0xDE
 #define A3XX_RBBM_PERFCTR_SP_6_HI 0xDF
 #define A3XX_RBBM_PERFCTR_SP_7_LO 0xE0
 #define A3XX_RBBM_PERFCTR_SP_7_HI 0xE1
+#define A3XX_RBBM_PERFCTR_RB_0_LO 0xE2
+#define A3XX_RBBM_PERFCTR_RB_0_HI 0xE3
+#define A3XX_RBBM_PERFCTR_RB_1_LO 0xE4
+#define A3XX_RBBM_PERFCTR_RB_1_HI 0xE5
+
 #define A3XX_RBBM_RBBM_CTL 0x100
-#define A3XX_RBBM_RBBM_CTL 0x100
+#define A3XX_RBBM_PERFCTR_PWR_0_LO 0x0EA
+#define A3XX_RBBM_PERFCTR_PWR_0_HI 0x0EB
 #define A3XX_RBBM_PERFCTR_PWR_1_LO 0x0EC
 #define A3XX_RBBM_PERFCTR_PWR_1_HI 0x0ED
 #define A3XX_RBBM_DEBUG_BUS_CTL             0x111
@@ -90,6 +178,7 @@
 #define A3XX_CP_MERCIU_DATA2 0x1D3
 #define A3XX_CP_MEQ_ADDR 0x1DA
 #define A3XX_CP_MEQ_DATA 0x1DB
+#define A3XX_CP_PERFCOUNTER_SELECT 0x445
 #define A3XX_CP_HW_FAULT  0x45C
 #define A3XX_CP_AHB_FAULT 0x54D
 #define A3XX_CP_PROTECT_CTRL 0x45E
@@ -138,6 +227,14 @@
 #define A3XX_VSC_PIPE_CONFIG_7 0xC1B
 #define A3XX_VSC_PIPE_DATA_ADDRESS_7 0xC1C
 #define A3XX_VSC_PIPE_DATA_LENGTH_7 0xC1D
+#define A3XX_PC_PERFCOUNTER0_SELECT 0xC48
+#define A3XX_PC_PERFCOUNTER1_SELECT 0xC49
+#define A3XX_PC_PERFCOUNTER2_SELECT 0xC4A
+#define A3XX_PC_PERFCOUNTER3_SELECT 0xC4B
+#define A3XX_GRAS_PERFCOUNTER0_SELECT 0xC88
+#define A3XX_GRAS_PERFCOUNTER1_SELECT 0xC89
+#define A3XX_GRAS_PERFCOUNTER2_SELECT 0xC8A
+#define A3XX_GRAS_PERFCOUNTER3_SELECT 0xC8B
 #define A3XX_GRAS_CL_USER_PLANE_X0 0xCA0
 #define A3XX_GRAS_CL_USER_PLANE_Y0 0xCA1
 #define A3XX_GRAS_CL_USER_PLANE_Z0 0xCA2
@@ -163,14 +260,42 @@
 #define A3XX_GRAS_CL_USER_PLANE_Z5 0xCB6
 #define A3XX_GRAS_CL_USER_PLANE_W5 0xCB7
 #define A3XX_RB_GMEM_BASE_ADDR 0xCC0
+#define A3XX_RB_PERFCOUNTER0_SELECT   0xCC6
+#define A3XX_RB_PERFCOUNTER1_SELECT   0xCC7
+#define A3XX_HLSQ_PERFCOUNTER0_SELECT 0xE00
+#define A3XX_HLSQ_PERFCOUNTER1_SELECT 0xE01
+#define A3XX_HLSQ_PERFCOUNTER2_SELECT 0xE02
+#define A3XX_HLSQ_PERFCOUNTER3_SELECT 0xE03
+#define A3XX_HLSQ_PERFCOUNTER4_SELECT 0xE04
+#define A3XX_HLSQ_PERFCOUNTER5_SELECT 0xE05
 #define A3XX_VFD_PERFCOUNTER0_SELECT 0xE44
+#define A3XX_VFD_PERFCOUNTER1_SELECT 0xE45
 #define A3XX_VPC_VPC_DEBUG_RAM_SEL 0xE61
 #define A3XX_VPC_VPC_DEBUG_RAM_READ 0xE62
+#define A3XX_VPC_PERFCOUNTER0_SELECT 0xE64
+#define A3XX_VPC_PERFCOUNTER1_SELECT 0xE65
 #define A3XX_UCHE_CACHE_MODE_CONTROL_REG 0xE82
+#define A3XX_UCHE_PERFCOUNTER0_SELECT 0xE84
+#define A3XX_UCHE_PERFCOUNTER1_SELECT 0xE85
+#define A3XX_UCHE_PERFCOUNTER2_SELECT 0xE86
+#define A3XX_UCHE_PERFCOUNTER3_SELECT 0xE87
+#define A3XX_UCHE_PERFCOUNTER4_SELECT 0xE88
+#define A3XX_UCHE_PERFCOUNTER5_SELECT 0xE89
 #define A3XX_UCHE_CACHE_INVALIDATE0_REG 0xEA0
+#define A3XX_SP_PERFCOUNTER0_SELECT 0xEC4
+#define A3XX_SP_PERFCOUNTER1_SELECT 0xEC5
+#define A3XX_SP_PERFCOUNTER2_SELECT 0xEC6
+#define A3XX_SP_PERFCOUNTER3_SELECT 0xEC7
+#define A3XX_SP_PERFCOUNTER4_SELECT 0xEC8
 #define A3XX_SP_PERFCOUNTER5_SELECT 0xEC9
 #define A3XX_SP_PERFCOUNTER6_SELECT 0xECA
 #define A3XX_SP_PERFCOUNTER7_SELECT 0xECB
+#define A3XX_TP_PERFCOUNTER0_SELECT 0xF04
+#define A3XX_TP_PERFCOUNTER1_SELECT 0xF05
+#define A3XX_TP_PERFCOUNTER2_SELECT 0xF06
+#define A3XX_TP_PERFCOUNTER3_SELECT 0xF07
+#define A3XX_TP_PERFCOUNTER4_SELECT 0xF08
+#define A3XX_TP_PERFCOUNTER5_SELECT 0xF09
 #define A3XX_GRAS_CL_CLIP_CNTL 0x2040
 #define A3XX_GRAS_CL_GB_CLIP_ADJ 0x2044
 #define A3XX_GRAS_CL_VPORT_XOFFSET 0x2048
@@ -232,12 +357,14 @@
 #define A3XX_SP_VS_OUT_REG_7 0x22CE
 #define A3XX_SP_VS_VPC_DST_REG_0 0x22D0
 #define A3XX_SP_VS_OBJ_OFFSET_REG 0x22D4
+#define A3XX_SP_VS_OBJ_START_REG 0x22D5
 #define A3XX_SP_VS_PVT_MEM_ADDR_REG 0x22D7
 #define A3XX_SP_VS_PVT_MEM_SIZE_REG 0x22D8
 #define A3XX_SP_VS_LENGTH_REG 0x22DF
 #define A3XX_SP_FS_CTRL_REG0 0x22E0
 #define A3XX_SP_FS_CTRL_REG1 0x22E1
 #define A3XX_SP_FS_OBJ_OFFSET_REG 0x22E2
+#define A3XX_SP_FS_OBJ_START_REG 0x22E3
 #define A3XX_SP_FS_PVT_MEM_ADDR_REG 0x22E5
 #define A3XX_SP_FS_PVT_MEM_SIZE_REG 0x22E6
 #define A3XX_SP_FS_FLAT_SHAD_MODE_REG_0 0x22E8
@@ -269,10 +396,25 @@
 #define A3XX_VBIF_OUT_AXI_AMEMTYPE_CONF0 0x3058
 #define A3XX_VBIF_OUT_AXI_AOOO_EN 0x305E
 #define A3XX_VBIF_OUT_AXI_AOOO 0x305F
+#define A3XX_VBIF_PERF_CNT_EN 0x3070
+#define A3XX_VBIF_PERF_CNT_CLR 0x3071
+#define A3XX_VBIF_PERF_CNT_SEL 0x3072
+#define A3XX_VBIF_PERF_CNT0_LO 0x3073
+#define A3XX_VBIF_PERF_CNT0_HI 0x3074
+#define A3XX_VBIF_PERF_CNT1_LO 0x3075
+#define A3XX_VBIF_PERF_CNT1_HI 0x3076
+#define A3XX_VBIF_PERF_PWR_CNT0_LO 0x3077
+#define A3XX_VBIF_PERF_PWR_CNT0_HI 0x3078
+#define A3XX_VBIF_PERF_PWR_CNT1_LO 0x3079
+#define A3XX_VBIF_PERF_PWR_CNT1_HI 0x307a
+#define A3XX_VBIF_PERF_PWR_CNT2_LO 0x307b
+#define A3XX_VBIF_PERF_PWR_CNT2_HI 0x307c
 
 /* Bit flags for RBBM_CTL */
-#define RBBM_RBBM_CTL_RESET_PWR_CTR1  (1 << 1)
-#define RBBM_RBBM_CTL_ENABLE_PWR_CTR1  (1 << 17)
+#define RBBM_RBBM_CTL_RESET_PWR_CTR0  BIT(0)
+#define RBBM_RBBM_CTL_RESET_PWR_CTR1  BIT(1)
+#define RBBM_RBBM_CTL_ENABLE_PWR_CTR0  BIT(16)
+#define RBBM_RBBM_CTL_ENABLE_PWR_CTR1  BIT(17)
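
The reset/enable pairs above drive the RBBM power counters: pulse the reset
bit to zero a counter, then set the enable bit to let it accumulate.  A
minimal sketch of that sequence follows (illustrative only, not part of the
patch; the helper name is hypothetical, while adreno_regread()/adreno_regwrite()
are the accessors this driver already uses):

/* Hypothetical helper: arm RBBM power counter 1 from a clean state. */
static void example_enable_rbbm_pwr_ctr1(struct kgsl_device *device)
{
	unsigned int ctl;

	adreno_regread(device, A3XX_RBBM_RBBM_CTL, &ctl);

	/* Pulse the reset bit to zero the counter */
	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL,
			ctl | RBBM_RBBM_CTL_RESET_PWR_CTR1);
	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL,
			ctl & ~RBBM_RBBM_CTL_RESET_PWR_CTR1);

	/* Turn the counter on; it accumulates in A3XX_RBBM_PERFCTR_PWR_1_LO/HI */
	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL,
			ctl | RBBM_RBBM_CTL_ENABLE_PWR_CTR1);
}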
 
 /* Various flags used by the context switch code */
 
@@ -537,7 +679,13 @@
 #define RBBM_BLOCK_ID_MARB_3           0x2b
 
 /* RBBM_CLOCK_CTL default value */
-#define A3XX_RBBM_CLOCK_CTL_DEFAULT 0xBFFFFFFF
+#define A305_RBBM_CLOCK_CTL_DEFAULT   0xAAAAAAAA
+#define A320_RBBM_CLOCK_CTL_DEFAULT   0xBFFFFFFF
+#define A330_RBBM_CLOCK_CTL_DEFAULT   0xAAAAAAAE
+#define A330v2_RBBM_CLOCK_CTL_DEFAULT 0xAAAAAAAA
+
+#define A330_RBBM_GPR0_CTL_DEFAULT  0x0AE2B8AE
+#define A330v2_RBBM_GPR0_CTL_DEFAULT  0x0AA2A8AA
 
 /* COUNTABLE FOR SP PERFCOUNTER */
 #define SP_FS_FULL_ALU_INSTRUCTIONS    0x0E
@@ -545,4 +693,20 @@
 #define SP0_ICL1_MISSES                0x1A
 #define SP_FS_CFLOW_INSTRUCTIONS       0x0C
 
+/* VBIF PERFCOUNTER ENA/CLR values */
+#define VBIF_PERF_CNT_0 BIT(0)
+#define VBIF_PERF_CNT_1 BIT(1)
+#define VBIF_PERF_PWR_CNT_0 BIT(2)
+#define VBIF_PERF_PWR_CNT_1 BIT(3)
+#define VBIF_PERF_PWR_CNT_2 BIT(4)
+
+/* VBIF PERFCOUNTER SEL values */
+#define VBIF_PERF_CNT_0_SEL 0
+#define VBIF_PERF_CNT_0_SEL_MASK 0x7f
+#define VBIF_PERF_CNT_1_SEL 8
+#define VBIF_PERF_CNT_1_SEL_MASK 0x7f00
+
+/* VBIF countables */
+#define VBIF_DDR_TOTAL_CYCLES 110
+
 #endif
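
For reference, the VBIF ENA/CLR/SEL values added above combine like this:
clear the counter, program a countable into its 7-bit select field, then
enable it.  A sketch under those assumptions (the helper name is
hypothetical; the register accessors are the driver's own):

/* Hypothetical helper: count total DDR cycles on VBIF counter 0. */
static void example_vbif_count_ddr_cycles(struct kgsl_device *device)
{
	unsigned int sel;

	/* Pulse the clear bit for counter 0 */
	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_CLR, VBIF_PERF_CNT_0);
	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_CLR, 0);

	/* Select the countable in counter 0's 7-bit field */
	adreno_regread(device, A3XX_VBIF_PERF_CNT_SEL, &sel);
	sel &= ~VBIF_PERF_CNT_0_SEL_MASK;
	sel |= VBIF_DDR_TOTAL_CYCLES << VBIF_PERF_CNT_0_SEL;
	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_SEL, sel);

	/* Enable counter 0; it accumulates in A3XX_VBIF_PERF_CNT0_LO/HI */
	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_EN, VBIF_PERF_CNT_0);
}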
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 1ad44646fa7cb3a179e02c3d978505e82f4325d4..4eb982186e734fae63d9255f9e1ec19cb2dcbaf3 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -30,10 +30,10 @@
 #include "kgsl_cffdump.h"
 #include "kgsl_sharedmem.h"
 #include "kgsl_iommu.h"
-#include "kgsl_trace.h"
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
+#include "adreno_trace.h"
 
 #include "a2xx_reg.h"
 #include "a3xx_reg.h"
@@ -117,19 +117,10 @@ static struct adreno_device device_3d0 = {
 	.ib_check_level = 0,
 };
 
-/* This set of registers are used for Hang detection
- * If the values of these registers are same after
- * KGSL_TIMEOUT_PART time, GPU hang is reported in
- * kernel log.
- * *****ALERT******ALERT********ALERT*************
- * Order of registers below is important, registers
- * from LONG_IB_DETECT_REG_INDEX_START to
- * LONG_IB_DETECT_REG_INDEX_END are used in long ib detection.
- */
 #define LONG_IB_DETECT_REG_INDEX_START 1
 #define LONG_IB_DETECT_REG_INDEX_END 5
 
-unsigned int ft_detect_regs[] = {
+unsigned int ft_detect_regs[FT_DETECT_REGS_COUNT] = {
 	A3XX_RBBM_STATUS,
 	REG_CP_RB_RPTR,   /* LONG_IB_DETECT_REG_INDEX_START */
 	REG_CP_IB1_BASE,
@@ -144,8 +135,6 @@ unsigned int ft_detect_regs[] = {
 	0
 };
 
-const unsigned int ft_detect_regs_count = ARRAY_SIZE(ft_detect_regs);
-
 /*
  * This is the master list of all GPU cores that are supported by this
  * driver.
@@ -206,31 +195,323 @@ static const struct {
 	{ ADRENO_REV_A320, 3, 2, ANY_ID, ANY_ID,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_512K, 0x3FF037, 0x3FF016 },
-	{ ADRENO_REV_A330, 3, 3, 0, 0,
+	{ ADRENO_REV_A330, 3, 3, 0, ANY_ID,
 		"a330_pm4.fw", "a330_pfp.fw", &adreno_a3xx_gpudev,
 		512, 0, 2, SZ_1M, NO_VER, NO_VER },
 };
 
-static irqreturn_t adreno_irq_handler(struct kgsl_device *device)
+/**
+ * adreno_perfcounter_init: Reserve kernel performance counters
+ * @device: device to configure
+ *
+ * The kernel needs a certain set of performance counters for its own
+ * activities.  Reserve these counters at init time so that they are
+ * always available to the kernel.  Userspace can still query the
+ * counters that the kernel holds, but they will remain active for as
+ * long as the device is alive.
+ */
+
+static void adreno_perfcounter_init(struct kgsl_device *device)
 {
-	irqreturn_t result;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	result = adreno_dev->gpudev->irq_handler(adreno_dev);
+	if (adreno_dev->gpudev->perfcounter_init)
+		adreno_dev->gpudev->perfcounter_init(adreno_dev);
+}
+
+/**
+ * adreno_perfcounter_start: Enable performance counters
+ * @adreno_dev: Adreno device to configure
+ *
+ * Ensure that every allocated performance counter is enabled.  Since the
+ * device was most likely stopped, the hardware state of the counters
+ * cannot be trusted, so reprogram each one.
+ */
+
+static void adreno_perfcounter_start(struct adreno_device *adreno_dev)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	unsigned int i, j;
+
+	/* group id iter */
+	for (i = 0; i < counters->group_count; i++) {
+		group = &(counters->groups[i]);
+
+		/* countable iter */
+		for (j = 0; j < group->reg_count; j++) {
+			if (group->regs[j].countable ==
+					KGSL_PERFCOUNTER_NOT_USED)
+				continue;
 
-	if (device->requested_state == KGSL_STATE_NONE) {
-		if (device->pwrctrl.nap_allowed == true) {
-			kgsl_pwrctrl_request_state(device, KGSL_STATE_NAP);
-			queue_work(device->work_queue, &device->idle_check_ws);
-		} else if (device->pwrscale.policy != NULL) {
-			queue_work(device->work_queue, &device->idle_check_ws);
+			if (adreno_dev->gpudev->perfcounter_enable)
+				adreno_dev->gpudev->perfcounter_enable(
+					adreno_dev, i, j,
+					group->regs[j].countable);
 		}
 	}
+}
 
-	/* Reset the time-out in our idle timer */
-	mod_timer_pending(&device->idle_timer,
-		jiffies + device->pwrctrl.interval_timeout);
-	return result;
+/**
+ * adreno_perfcounter_read_group: Read the values of a list of countables
+ * @adreno_dev: Adreno device whose counters are being read
+ * @reads: List of kgsl_perfcounter_read_groups
+ * @count: Length of list
+ *
+ * Read the performance counters for the groupid/countable pairs and return
+ * the 64 bit result for each pair
+ */
+
+int adreno_perfcounter_read_group(struct adreno_device *adreno_dev,
+	struct kgsl_perfcounter_read_group *reads, unsigned int count)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	struct kgsl_perfcounter_read_group *list = NULL;
+	unsigned int i, j;
+	int ret = 0;
+
+	/* perfcounter get/put/query/read not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	/* sanity check for later */
+	if (!adreno_dev->gpudev->perfcounter_read)
+		return -EINVAL;
+
+	/* sanity check params passed in */
+	if (reads == NULL || count == 0 || count > 100)
+		return -EINVAL;
+
+	/* verify valid inputs group ids and countables */
+	for (i = 0; i < count; i++) {
+		if (reads[i].groupid >= counters->group_count)
+			return -EINVAL;
+	}
+
+	list = kmalloc(sizeof(struct kgsl_perfcounter_read_group) * count,
+			GFP_KERNEL);
+	if (!list)
+		return -ENOMEM;
+
+	if (copy_from_user(list, reads,
+			sizeof(struct kgsl_perfcounter_read_group) * count)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	/* list iterator */
+	for (j = 0; j < count; j++) {
+		list[j].value = 0;
+
+		group = &(counters->groups[list[j].groupid]);
+
+		/* group/counter iterator */
+		for (i = 0; i < group->reg_count; i++) {
+			if (group->regs[i].countable == list[j].countable) {
+				list[j].value =
+					adreno_dev->gpudev->perfcounter_read(
+					adreno_dev, list[j].groupid,
+					i, group->regs[i].offset);
+				break;
+			}
+		}
+	}
+
+	/* write the data */
+	if (copy_to_user(reads, list,
+			sizeof(struct kgsl_perfcounter_read_group) *
+			count) != 0)
+		ret = -EFAULT;
+
+done:
+	kfree(list);
+	return ret;
+}
+
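Note that `reads` must be a userspace pointer (hence the copy_from_user()/
copy_to_user() pair above).  A sketch of the caller's side, assuming the
matching IOCTL_KGSL_PERFCOUNTER_READ plumbing in kgsl.c and msm_kgsl.h
(those names are not shown in this hunk, and `sp_groupid`/`fd` are
placeholders); pairs not currently loaded in a counter read back as zero:

/* Userspace sketch: read two SP countables in one call (illustrative). */
struct kgsl_perfcounter_read_group reads[2] = {
	{ .groupid = sp_groupid, .countable = SP_FS_FULL_ALU_INSTRUCTIONS },
	{ .groupid = sp_groupid, .countable = SP0_ICL1_MISSES },
};
struct kgsl_perfcounter_read req = { .reads = reads, .count = 2 };

/* sp_groupid would come from the gpudev's perfcounter group table */
if (ioctl(fd, IOCTL_KGSL_PERFCOUNTER_READ, &req) == 0)
	printf("full ALU instructions: %llu\n", reads[0].value);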
+/**
+ * adreno_perfcounter_query_group: Determine which countables are in counters
+ * @adreno_dev: Adreno device to configure
+ * @groupid: Desired performance counter group
+ * @countables: Return list of all countables in the group's counters
+ * @count: Max length of the array
+ * @max_counters: max counters for the groupid
+ *
+ * Query the current state of counters for the group.
+ */
+
+int adreno_perfcounter_query_group(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int *countables, unsigned int count,
+	unsigned int *max_counters)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	unsigned int i;
+
+	*max_counters = 0;
+
+	/* perfcounter get/put/query not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	if (groupid >= counters->group_count)
+		return -EINVAL;
+
+	group = &(counters->groups[groupid]);
+	*max_counters = group->reg_count;
+
+	/*
+	 * If the countables list is NULL or count is zero, just return the
+	 * max reg_count in *max_counters and report success
+	 */
+	if (countables == NULL || count == 0)
+		return 0;
+
+	/*
+	 * Go through all available counters.  Write up to count countable
+	 * values into the user buffer.
+	 */
+	for (i = 0; i < group->reg_count && i < count; i++) {
+		if (copy_to_user(&countables[i], &(group->regs[i].countable),
+				sizeof(unsigned int)) != 0)
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
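The NULL/zero early return above enables a two-pass query.  A minimal
sketch, assuming `groupid` and a userspace buffer `ubuf` are already in
hand (the buffer must be a user pointer because of the copy_to_user()):

/* Illustrative two-pass query: size the group, then fetch its countables. */
unsigned int max = 0;
int ret;

ret = adreno_perfcounter_query_group(adreno_dev, groupid, NULL, 0, &max);
if (ret == 0 && max > 0)
	ret = adreno_perfcounter_query_group(adreno_dev, groupid, ubuf,
						max, &max);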
+/**
+ * adreno_perfcounter_get: Try to put a countable in an available counter
+ * @adreno_dev: Adreno device to configure
+ * @groupid: Desired performance counter group
+ * @countable: Countable desired to be in a counter
+ * @offset: Return offset of the countable
+ * @flags: Used to setup kernel perf counters
+ *
+ * Try to place a countable in an available counter.  If the countable is
+ * already in a counter, reference count the counter/countable pair resource
+ * and return success
+ */
+
+int adreno_perfcounter_get(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable, unsigned int *offset,
+	unsigned int flags)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+	unsigned int i, empty = -1;
+
+	/* always clear return variables */
+	if (offset)
+		*offset = 0;
+
+	/* perfcounter get/put/query not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	if (groupid >= counters->group_count)
+		return -EINVAL;
+
+	group = &(counters->groups[groupid]);
+
+	/*
+	 * Check if the countable is already associated with a counter.
+	 * Refcount and return the offset, otherwise, try and find an empty
+	 * counter and assign the countable to it.
+	 */
+	for (i = 0; i < group->reg_count; i++) {
+		if (group->regs[i].countable == countable) {
+			/* Countable already associated with counter */
+			group->regs[i].refcount++;
+			group->regs[i].flags |= flags;
+			if (offset)
+				*offset = group->regs[i].offset;
+			return 0;
+		} else if (group->regs[i].countable ==
+			KGSL_PERFCOUNTER_NOT_USED) {
+			/* keep track of unused counter */
+			empty = i;
+		}
+	}
+
+	/* no available counters, so do nothing else */
+	if (empty == -1)
+		return -EBUSY;
+
+	/* initialize the new counter */
+	group->regs[empty].countable = countable;
+	group->regs[empty].refcount = 1;
+
+	/* enable the new counter */
+	adreno_dev->gpudev->perfcounter_enable(adreno_dev, groupid, empty,
+		countable);
+
+	group->regs[empty].flags = flags;
+
+	if (offset)
+		*offset = group->regs[empty].offset;
+
+	return 0;
+}
+
+
+/**
+ * adreno_perfcounter_put: Release a countable from counter resource
+ * @adreno_dev: Adreno device to configure
+ * @groupid: Desired performance counter group
+ * @countable: Countable to be freed from its counter
+ *
+ * Put back a performance counter/countable pair that was previously
+ * obtained with adreno_perfcounter_get().  If no one else is using the
+ * countable, free up the counter for others.
+ */
+int adreno_perfcounter_put(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable)
+{
+	struct adreno_perfcounters *counters = adreno_dev->gpudev->perfcounters;
+	struct adreno_perfcount_group *group;
+
+	unsigned int i;
+
+	/* perfcounter get/put/query not allowed on a2xx */
+	if (adreno_is_a2xx(adreno_dev))
+		return -EINVAL;
+
+	if (groupid >= counters->group_count)
+		return -EINVAL;
+
+	group = &(counters->groups[groupid]);
+
+	for (i = 0; i < group->reg_count; i++) {
+		if (group->regs[i].countable == countable) {
+			if (group->regs[i].refcount > 0) {
+				group->regs[i].refcount--;
+
+				/*
+				 * bookkeeping to ensure we never free a
+				 * perf counter used by the kernel
+				 */
+				if (group->regs[i].flags &&
+					group->regs[i].refcount == 0)
+					group->regs[i].refcount++;
+
+				/* make available if not used */
+				if (group->regs[i].refcount == 0)
+					group->regs[i].countable =
+						KGSL_PERFCOUNTER_NOT_USED;
+			}
+
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
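Taken together, _get() and _put() form a refcounted claim on a hardware
counter: _get() either bumps an existing assignment of the countable or
programs it into a free slot and returns that counter's register offset,
while _put() drops the reference and marks the slot KGSL_PERFCOUNTER_NOT_USED
at zero (unless flagged as kernel-owned).  An in-kernel usage sketch, with
`groupid` and `countable` assumed to be valid for this GPU:

/* Illustrative claim/sample/release cycle for one countable. */
unsigned int offset;

if (adreno_perfcounter_get(adreno_dev, groupid, countable, &offset, 0) == 0) {
	/* ... sample the counter via adreno_regread() at `offset` ... */
	adreno_perfcounter_put(adreno_dev, groupid, countable);
}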
+static irqreturn_t adreno_irq_handler(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	return adreno_dev->gpudev->irq_handler(adreno_dev);
 }
 
 static void adreno_cleanup_pt(struct kgsl_device *device,
@@ -255,23 +536,19 @@ static int adreno_setup_pt(struct kgsl_device *device,
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
-	result = kgsl_mmu_map_global(pagetable, &rb->buffer_desc,
-				     GSL_PT_PAGE_RV);
+	result = kgsl_mmu_map_global(pagetable, &rb->buffer_desc);
 	if (result)
 		goto error;
 
-	result = kgsl_mmu_map_global(pagetable, &rb->memptrs_desc,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &rb->memptrs_desc);
 	if (result)
 		goto unmap_buffer_desc;
 
-	result = kgsl_mmu_map_global(pagetable, &device->memstore,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->memstore);
 	if (result)
 		goto unmap_memptrs_desc;
 
-	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory);
 	if (result)
 		goto unmap_memstore_desc;
 
@@ -297,7 +574,7 @@ error:
 	return result;
 }
 
-static void adreno_iommu_setstate(struct kgsl_device *device,
+static int adreno_iommu_setstate(struct kgsl_device *device,
 					unsigned int context_id,
 					uint32_t flags)
 {
@@ -309,13 +586,15 @@ static void adreno_iommu_setstate(struct kgsl_device *device,
 	int num_iommu_units, i;
 	struct kgsl_context *context;
 	struct adreno_context *adreno_ctx = NULL;
+	int result = 0;
 
 	/*
 	 * If we're idle and we don't need to use the GPU to save context
 	 * state, use the CPU instead of the GPU to reprogram the
 	 * iommu for simplicity's sake.
 	 */
-	 if (!adreno_dev->drawctxt_active || device->ftbl->isidle(device))
+	 if (!adreno_dev->drawctxt_active || device->ftbl->isidle(device) ||
+		 !atomic_read(&device->active_cnt))
 		return kgsl_mmu_device_setstate(&device->mmu, flags);
 
 	num_iommu_units = kgsl_mmu_get_num_iommu_units(&device->mmu);
@@ -323,11 +602,13 @@ static void adreno_iommu_setstate(struct kgsl_device *device,
 	context = kgsl_context_get(device, context_id);
 
 	if (context == NULL)
-		return;
-	adreno_ctx = context->devctxt;
+		return 0;
+
+	adreno_ctx = ADRENO_CONTEXT(context);
+
+	result = kgsl_mmu_enable_clk(&device->mmu, KGSL_IOMMU_CONTEXT_USER);
 
-	if (kgsl_mmu_enable_clk(&device->mmu,
-				KGSL_IOMMU_CONTEXT_USER))
+	if (result)
 		goto done;
 
 	cmds += __adreno_add_idle_indirect_cmds(cmds,
@@ -429,17 +710,28 @@ static void adreno_iommu_setstate(struct kgsl_device *device,
 
 	sizedwords += (cmds - &link[0]);
 	if (sizedwords) {
+
 		/* invalidate all base pointers */
 		*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1);
 		*cmds++ = 0x7fff;
 		sizedwords += 2;
 		/* This returns the per context timestamp but we need to
 		 * use the global timestamp for iommu clock disablement */
-		adreno_ringbuffer_issuecmds(device, adreno_ctx,
+		result = adreno_ringbuffer_issuecmds(device, adreno_ctx,
 			KGSL_CMD_FLAGS_PMODE,
 			&link[0], sizedwords);
+
+		if (result) {
+			/* On error disable the IOMMU clock right away */
+			kgsl_mmu_disable_clk(&device->mmu);
+			goto done;
+		}
+
 		kgsl_mmu_disable_clk_on_ts(&device->mmu,
 				adreno_dev->ringbuffer.global_ts, true);
 	}
 
 	if (sizedwords > (sizeof(link)/sizeof(unsigned int))) {
@@ -448,9 +740,10 @@ static void adreno_iommu_setstate(struct kgsl_device *device,
 	}
 done:
 	kgsl_context_put(context);
+	return result;
 }
 
-static void adreno_gpummu_setstate(struct kgsl_device *device,
+static int adreno_gpummu_setstate(struct kgsl_device *device,
 					unsigned int context_id,
 					uint32_t flags)
 {
@@ -461,6 +754,7 @@ static void adreno_gpummu_setstate(struct kgsl_device *device,
 	unsigned int mh_mmu_invalidate = 0x00000003; /*invalidate all and tc */
 	struct kgsl_context *context;
 	struct adreno_context *adreno_ctx = NULL;
+	int ret = 0;
 
 	/*
 	 * Fix target freeze issue by adding TLB flush for each submit
@@ -477,8 +771,9 @@ static void adreno_gpummu_setstate(struct kgsl_device *device,
 	if (!kgsl_cff_dump_enable && adreno_dev->drawctxt_active) {
 		context = kgsl_context_get(device, context_id);
 		if (context == NULL)
-			return;
-		adreno_ctx = context->devctxt;
+			return -EINVAL;
+
+		adreno_ctx = ADRENO_CONTEXT(context);
 
 		if (flags & KGSL_MMUFLAGS_PTUPDATE) {
 			/* wait for graphics pipe to be idle */
@@ -552,7 +847,7 @@ static void adreno_gpummu_setstate(struct kgsl_device *device,
 			sizedwords += 2;
 		}
 
-		adreno_ringbuffer_issuecmds(device, adreno_ctx,
+		ret = adreno_ringbuffer_issuecmds(device, adreno_ctx,
 					KGSL_CMD_FLAGS_PMODE,
 					&link[0], sizedwords);
 
@@ -560,9 +855,11 @@ static void adreno_gpummu_setstate(struct kgsl_device *device,
 	} else {
 		kgsl_mmu_device_setstate(&device->mmu, flags);
 	}
+
+	return ret;
 }
 
-static void adreno_setstate(struct kgsl_device *device,
+static int adreno_setstate(struct kgsl_device *device,
 			unsigned int context_id,
 			uint32_t flags)
 {
@@ -571,6 +868,8 @@ static void adreno_setstate(struct kgsl_device *device,
 		return adreno_gpummu_setstate(device, context_id, flags);
 	else if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
 		return adreno_iommu_setstate(device, context_id, flags);
+
+	return 0;
 }
 
 static unsigned int
@@ -1188,6 +1487,10 @@ adreno_probe(struct platform_device *pdev)
 	if (status)
 		goto error_close_rb;
 
+	status = adreno_dispatcher_init(adreno_dev);
+	if (status)
+		goto error_close_device;
+
 	adreno_debugfs_init(device);
 
 	kgsl_pwrscale_init(device);
@@ -1196,6 +1499,8 @@ adreno_probe(struct platform_device *pdev)
 	device->flags &= ~KGSL_FLAGS_SOFT_RESET;
 	return 0;
 
+error_close_device:
+	kgsl_device_platform_remove(device);
 error_close_rb:
 	adreno_ringbuffer_close(&adreno_dev->ringbuffer);
 error:
@@ -1215,19 +1520,18 @@ static int __devexit adreno_remove(struct platform_device *pdev)
 	kgsl_pwrscale_detach_policy(device);
 	kgsl_pwrscale_close(device);
 
+	adreno_dispatcher_close(adreno_dev);
 	adreno_ringbuffer_close(&adreno_dev->ringbuffer);
 	kgsl_device_platform_remove(device);
 
 	return 0;
 }
 
-static int adreno_start(struct kgsl_device *device, unsigned int init_ram)
+static int adreno_init(struct kgsl_device *device)
 {
-	int status = -EINVAL;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	if (KGSL_STATE_DUMP_AND_FT != device->state)
-		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
+	kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
 
 	/* Power up the device */
 	kgsl_pwrctrl_enable(device);
@@ -1250,10 +1554,9 @@ static int adreno_start(struct kgsl_device *device, unsigned int init_ram)
 	if (adreno_dev->gpurev == ADRENO_REV_UNKNOWN) {
 		KGSL_DRV_ERR(device, "Unknown chip ID %x\n",
 			adreno_dev->chip_id);
-		goto error_clk_off;
+		BUG_ON(1);
 	}
 
-
 	/*
 	 * Check if firmware supports the sync lock PM4 packets needed
 	 * for IOMMUv1
@@ -1265,7 +1568,32 @@ static int adreno_start(struct kgsl_device *device, unsigned int init_ram)
 		adreno_gpulist[adreno_dev->gpulist_index].sync_lock_pfp_ver))
 		device->mmu.flags |= KGSL_MMU_FLAGS_IOMMU_SYNC;
 
-	/* Set up the MMU */
+	/* Assign the correct RBBM status register to the hang detect regs */
+	ft_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
+
+	adreno_perfcounter_init(device);
+
+	/* Power down the device */
+	kgsl_pwrctrl_disable(device);
+
+	return 0;
+}
+
+static int adreno_start(struct kgsl_device *device)
+{
+	int status = -EINVAL;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	unsigned int state = device->state;
+
+	kgsl_cffdump_open(device);
+
+	kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
+
+	/* Power up the device */
+	kgsl_pwrctrl_enable(device);
+
+	/* Set up a2xx special case */
 	if (adreno_is_a2xx(adreno_dev)) {
 		/*
 		 * the MH_CLNT_INTF_CTRL_CONFIG registers aren't present
@@ -1279,20 +1607,6 @@ static int adreno_start(struct kgsl_device *device, unsigned int init_ram)
 		kgsl_mh_start(device);
 	}
 
-	/* Assign correct RBBM status register to hang detect regs
-	 */
-	ft_detect_regs[0] = adreno_dev->gpudev->reg_rbbm_status;
-
-	/* Add A3XX specific registers for hang detection */
-	if (adreno_is_a3xx(adreno_dev)) {
-		ft_detect_regs[6] = A3XX_RBBM_PERFCTR_SP_7_LO;
-		ft_detect_regs[7] = A3XX_RBBM_PERFCTR_SP_7_HI;
-		ft_detect_regs[8] = A3XX_RBBM_PERFCTR_SP_6_LO;
-		ft_detect_regs[9] = A3XX_RBBM_PERFCTR_SP_6_HI;
-		ft_detect_regs[10] = A3XX_RBBM_PERFCTR_SP_5_LO;
-		ft_detect_regs[11] = A3XX_RBBM_PERFCTR_SP_5_HI;
-	}
-
 	status = kgsl_mmu_start(device);
 	if (status)
 		goto error_clk_off;
@@ -1309,17 +1623,14 @@ static int adreno_start(struct kgsl_device *device, unsigned int init_ram)
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
 	device->ftbl->irqctrl(device, 1);
 
-	status = adreno_ringbuffer_start(&adreno_dev->ringbuffer, init_ram);
+	status = adreno_ringbuffer_start(&adreno_dev->ringbuffer);
 	if (status)
 		goto error_irq_off;
 
-	/*
-	 * While recovery is on we do not want timer to
-	 * fire and attempt to change any device state
-	 */
+	/* Start the dispatcher */
+	adreno_dispatcher_start(adreno_dev);
 
-	if (KGSL_STATE_DUMP_AND_FT != device->state)
-		mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
+	adreno_perfcounter_start(adreno_dev);
 
 	device->reset_counter++;
 
@@ -1332,7 +1643,11 @@ error_mmu_off:
 	kgsl_mmu_stop(&device->mmu);
 
 error_clk_off:
-	kgsl_pwrctrl_disable(device);
+	if (KGSL_STATE_DUMP_AND_FT != device->state) {
+		kgsl_pwrctrl_disable(device);
+		/* set the state back to original state */
+		kgsl_pwrctrl_set_state(device, state);
+	}
 
 	return status;
 }
@@ -1341,8 +1656,12 @@ static int adreno_stop(struct kgsl_device *device)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
+	if (adreno_dev->drawctxt_active)
+		kgsl_context_put(&adreno_dev->drawctxt_active->base);
+
 	adreno_dev->drawctxt_active = NULL;
 
+	adreno_dispatcher_stop(adreno_dev);
 	adreno_ringbuffer_stop(&adreno_dev->ringbuffer);
 
 	kgsl_mmu_stop(&device->mmu);
@@ -1356,790 +1675,59 @@ static int adreno_stop(struct kgsl_device *device)
 	/* Power down the device */
 	kgsl_pwrctrl_disable(device);
 
+	kgsl_cffdump_close(device->id);
+
 	return 0;
 }
 
-static void adreno_mark_context_status(struct kgsl_device *device,
-					int ft_status)
+/**
+ * adreno_reset() - Helper function to reset the GPU
+ * @device: Pointer to the KGSL device structure for the GPU
+ *
+ * Helper function to reset the GPU hardware by toggling the footswitch
+ */
+int adreno_reset(struct kgsl_device *device)
 {
-	struct kgsl_context *context;
-	int next = 0;
-	/*
-	 * Set the reset status of all contexts to
-	 * INNOCENT_CONTEXT_RESET_EXT except for the bad context
-	 * since thats the guilty party, if fault tolerance failed then
-	 * mark all as guilty
-	 */
+	int ret;
 
-	rcu_read_lock();
-	while ((context = idr_get_next(&device->context_idr, &next))) {
-		struct adreno_context *adreno_context = context->devctxt;
-		if (ft_status) {
-			context->reset_status =
-					KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
-			adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
-		} else if (KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT !=
-			context->reset_status) {
-			if (adreno_context->flags & (CTXT_FLAGS_GPU_HANG |
-				CTXT_FLAGS_GPU_HANG_FT))
-				context->reset_status =
-				KGSL_CTX_STAT_GUILTY_CONTEXT_RESET_EXT;
-			else
-				context->reset_status =
-				KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT;
-		}
-		next = next + 1;
-	}
-	rcu_read_unlock();
-}
+	ret = adreno_stop(device);
+	if (ret)
+		return ret;
 
-static void adreno_set_max_ts_for_bad_ctxs(struct kgsl_device *device)
-{
-	struct kgsl_context *context;
-	struct adreno_context *temp_adreno_context;
-	int next = 0;
+	ret = adreno_init(device);
+	if (ret)
+		return ret;
 
-	rcu_read_lock();
-	while ((context = idr_get_next(&device->context_idr, &next))) {
-		temp_adreno_context = context->devctxt;
-		if (temp_adreno_context->flags & CTXT_FLAGS_GPU_HANG) {
-			kgsl_sharedmem_writel(&device->memstore,
-				KGSL_MEMSTORE_OFFSET(context->id,
-				soptimestamp),
-				temp_adreno_context->timestamp);
-			kgsl_sharedmem_writel(&device->memstore,
-				KGSL_MEMSTORE_OFFSET(context->id,
-				eoptimestamp),
-				temp_adreno_context->timestamp);
-		}
-		next = next + 1;
-	}
-	rcu_read_unlock();
-}
+	ret = adreno_start(device);
 
-static void adreno_destroy_ft_data(struct adreno_ft_data *ft_data)
-{
-	vfree(ft_data->rb_buffer);
-	vfree(ft_data->bad_rb_buffer);
-	vfree(ft_data->good_rb_buffer);
-}
+	if (ret == 0) {
+		/*
+		 * If active_cnt is non-zero then the system was active before
+		 * going into a reset - put it back in that state
+		 */
 
-static int _find_start_of_cmd_seq(struct adreno_ringbuffer *rb,
-					unsigned int *ptr,
-					bool inc)
-{
-	int status = -EINVAL;
-	unsigned int val1;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int start_ptr = *ptr;
-
-	while ((start_ptr / sizeof(unsigned int)) != rb->wptr) {
-		if (inc)
-			start_ptr = adreno_ringbuffer_inc_wrapped(start_ptr,
-									size);
-		else
-			start_ptr = adreno_ringbuffer_dec_wrapped(start_ptr,
-									size);
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val1, start_ptr);
-		if (KGSL_CMD_IDENTIFIER == val1) {
-			if ((start_ptr / sizeof(unsigned int)) != rb->wptr)
-				start_ptr = adreno_ringbuffer_dec_wrapped(
-							start_ptr, size);
-				*ptr = start_ptr;
-				status = 0;
-				break;
-		}
+		if (atomic_read(&device->active_cnt))
+			kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
 	}
-	return status;
+
+	return ret;
 }
 
-static int _find_cmd_seq_after_eop_ts(struct adreno_ringbuffer *rb,
-					unsigned int *rb_rptr,
-					unsigned int global_eop,
-					bool inc)
+static int adreno_getproperty(struct kgsl_device *device,
+				enum kgsl_property_type type,
+				void *value,
+				unsigned int sizebytes)
 {
 	int status = -EINVAL;
-	unsigned int temp_rb_rptr = *rb_rptr;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int val[3];
-	int i = 0;
-	bool check = false;
-
-	if (inc && temp_rb_rptr / sizeof(unsigned int) != rb->wptr)
-		return status;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-	do {
-		/*
-		 * when decrementing we need to decrement first and
-		 * then read make sure we cover all the data
-		 */
-		if (!inc)
-			temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
-					temp_rb_rptr, size);
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i],
-					temp_rb_rptr);
-
-		if (check && ((inc && val[i] == global_eop) ||
-			(!inc && (val[i] ==
-			cp_type3_packet(CP_MEM_WRITE, 2) ||
-			val[i] == CACHE_FLUSH_TS)))) {
-			/* decrement i, i.e i = (i - 1 + 3) % 3 if
-			 * we are going forward, else increment i */
-			i = (i + 2) % 3;
-			if (val[i] == rb->device->memstore.gpuaddr +
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-						eoptimestamp)) {
-				int j = ((i + 2) % 3);
-				if ((inc && (val[j] == CACHE_FLUSH_TS ||
-						val[j] == cp_type3_packet(
-							CP_MEM_WRITE, 2))) ||
-					(!inc && val[j] == global_eop)) {
-						/* Found the global eop */
-						status = 0;
-						break;
-				}
-			}
-			/* if no match found then increment i again
-			 * since we decremented before matching */
-			i = (i + 1) % 3;
-		}
-		if (inc)
-			temp_rb_rptr = adreno_ringbuffer_inc_wrapped(
-						temp_rb_rptr, size);
-
-		i = (i + 1) % 3;
-		if (2 == i)
-			check = true;
-	} while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr);
-	/* temp_rb_rptr points to the command stream after global eop,
-	 * move backward till the start of command sequence */
-	if (!status) {
-		status = _find_start_of_cmd_seq(rb, &temp_rb_rptr, false);
-		if (!status) {
-			*rb_rptr = temp_rb_rptr;
-			KGSL_FT_INFO(rb->device,
-			"Offset of cmd sequence after eop timestamp: 0x%x\n",
-			temp_rb_rptr / sizeof(unsigned int));
-		}
-	}
-	if (status)
-		KGSL_FT_ERR(rb->device,
-		"Failed to find the command sequence after eop timestamp\n");
-	return status;
-}
+	switch (type) {
+	case KGSL_PROP_DEVICE_INFO:
+		{
+			struct kgsl_devinfo devinfo;
 
-static int _find_hanging_ib_sequence(struct adreno_ringbuffer *rb,
-				unsigned int *rb_rptr,
-				unsigned int ib1)
-{
-	int status = -EINVAL;
-	unsigned int temp_rb_rptr = *rb_rptr;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int val[2];
-	int i = 0;
-	bool check = false;
-	bool ctx_switch = false;
-
-	while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
-
-		if (check && val[i] == ib1) {
-			/* decrement i, i.e i = (i - 1 + 2) % 2 */
-			i = (i + 1) % 2;
-			if (adreno_cmd_is_ib(val[i])) {
-				/* go till start of command sequence */
-				status = _find_start_of_cmd_seq(rb,
-						&temp_rb_rptr, false);
-
-				KGSL_FT_INFO(rb->device,
-				"Found the hanging IB at offset 0x%x\n",
-				temp_rb_rptr / sizeof(unsigned int));
-				break;
-			}
-			/* if no match the increment i since we decremented
-			 * before checking */
-			i = (i + 1) % 2;
-		}
-		/* Make sure you do not encounter a context switch twice, we can
-		 * encounter it once for the bad context as the start of search
-		 * can point to the context switch */
-		if (val[i] == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
-			if (ctx_switch) {
-				KGSL_FT_ERR(rb->device,
-				"Context switch encountered before bad "
-				"IB found\n");
-				break;
-			}
-			ctx_switch = true;
-		}
-		i = (i + 1) % 2;
-		if (1 == i)
-			check = true;
-		temp_rb_rptr = adreno_ringbuffer_inc_wrapped(temp_rb_rptr,
-								size);
-	}
-	if  (!status)
-		*rb_rptr = temp_rb_rptr;
-	return status;
-}
-
-static int adreno_setup_ft_data(struct kgsl_device *device,
-					struct adreno_ft_data *ft_data)
-{
-	int ret = 0;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
-	struct kgsl_context *context;
-	struct adreno_context *adreno_context;
-	unsigned int rb_rptr = rb->wptr * sizeof(unsigned int);
-
-	memset(ft_data, 0, sizeof(*ft_data));
-	ft_data->start_of_replay_cmds = 0xFFFFFFFF;
-	ft_data->replay_for_snapshot = 0xFFFFFFFF;
-
-	adreno_regread(device, REG_CP_IB1_BASE, &ft_data->ib1);
-
-	kgsl_sharedmem_readl(&device->memstore, &ft_data->context_id,
-			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-			current_context));
-
-	kgsl_sharedmem_readl(&device->memstore,
-			&ft_data->global_eop,
-			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-			eoptimestamp));
-
-	ft_data->rb_buffer = vmalloc(rb->buffer_desc.size);
-	if (!ft_data->rb_buffer) {
-		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
-				rb->buffer_desc.size);
-		return -ENOMEM;
-	}
-
-	ft_data->bad_rb_buffer = vmalloc(rb->buffer_desc.size);
-	if (!ft_data->bad_rb_buffer) {
-		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
-				rb->buffer_desc.size);
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	ft_data->good_rb_buffer = vmalloc(rb->buffer_desc.size);
-	if (!ft_data->good_rb_buffer) {
-		KGSL_MEM_ERR(device, "vmalloc(%d) failed\n",
-				rb->buffer_desc.size);
-		ret = -ENOMEM;
-		goto done;
-	}
-
-	ft_data->status =  0;
-
-	/* find the start of bad command sequence in rb */
-	context = idr_find(&device->context_idr, ft_data->context_id);
-	/* Look for the command stream that is right after the global eop */
-
-	if (!context) {
-		/*
-		 * If there is no context then fault tolerance does not need to
-		 * replay anything, just reset GPU and thats it
-		 */
-		goto done;
-	}
-	ret = _find_cmd_seq_after_eop_ts(rb, &rb_rptr,
-					ft_data->global_eop + 1, false);
-	if (ret)
-		goto done;
-
-	ft_data->start_of_replay_cmds = rb_rptr;
-
-	if (!adreno_dev->ft_policy)
-		adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
-
-	ft_data->ft_policy = adreno_dev->ft_policy;
-
-
-	adreno_context = context->devctxt;
-	if (adreno_context->flags & CTXT_FLAGS_PREAMBLE) {
-		if (ft_data->ib1) {
-			ret = _find_hanging_ib_sequence(rb,
-					&rb_rptr, ft_data->ib1);
-			if (ret) {
-				KGSL_FT_ERR(device,
-				"Start not found for replay IB sequence\n");
-				ret = 0;
-				goto done;
-			}
-			ft_data->start_of_replay_cmds = rb_rptr;
-			ft_data->replay_for_snapshot = rb_rptr;
-		}
-	}
-
-done:
-	if (ret) {
-		vfree(ft_data->rb_buffer);
-		vfree(ft_data->bad_rb_buffer);
-		vfree(ft_data->good_rb_buffer);
-	}
-	return ret;
-}
-
-static int
-_adreno_check_long_ib(struct kgsl_device *device)
-{
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int curr_global_ts = 0;
-
-	/* check if the global ts is still the same */
-	kgsl_sharedmem_readl(&device->memstore,
-			&curr_global_ts,
-			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-			eoptimestamp));
-
-	/* Mark long ib as handled */
-	adreno_dev->long_ib = 0;
-
-	if (curr_global_ts == adreno_dev->long_ib_ts) {
-		KGSL_FT_ERR(device,
-			"IB ran too long, invalidate ctxt\n");
-		return 1;
-	} else {
-		/* Do nothing GPU has gone ahead */
-		KGSL_FT_INFO(device, "false long ib detection return\n");
-		return 0;
-	}
-}
-
-static int
-_adreno_ft_restart_device(struct kgsl_device *device,
-		   struct kgsl_context *context,
-		   struct adreno_ft_data *ft_data)
-{
-
-	struct adreno_context *adreno_context = context->devctxt;
-
-	/* restart device */
-	if (adreno_stop(device)) {
-		KGSL_FT_ERR(device, "Device stop failed\n");
-		return 1;
-	}
-
-	if (adreno_start(device, true)) {
-		KGSL_FT_ERR(device, "Device start failed\n");
-		return 1;
-	}
-
-	if (context)
-		kgsl_mmu_setstate(&device->mmu, adreno_context->pagetable,
-			KGSL_MEMSTORE_GLOBAL);
-
-	/* If iommu is used then we need to make sure that the iommu clocks
-	 * are on since there could be commands in pipeline that touch iommu */
-	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype()) {
-		if (kgsl_mmu_enable_clk(&device->mmu,
-				KGSL_IOMMU_CONTEXT_USER))
-			return 1;
-	}
-
-	return 0;
-}
-
-static inline void
-_adreno_debug_ft_info(struct kgsl_device *device,
-			struct adreno_ft_data *ft_data)
-{
-
-	/*
-	 * Dumping rb is a very useful tool to debug FT.
-	 * It will tell us if we are extracting the rb correctly
-	 * NOP'ing the right IB, skipping the EOF correctly etc.
-	 */
-	if (device->ft_log >= 7)  {
-
-		/* Print fault tolerance data here */
-		KGSL_FT_INFO(device, "Temp RB buffer size 0x%X\n",
-			ft_data->rb_size);
-		adreno_dump_rb(device, ft_data->rb_buffer,
-			ft_data->rb_size<<2, 0, ft_data->rb_size);
-
-		KGSL_FT_INFO(device, "Bad RB buffer size 0x%X\n",
-			ft_data->bad_rb_size);
-		adreno_dump_rb(device, ft_data->bad_rb_buffer,
-			ft_data->bad_rb_size<<2, 0, ft_data->bad_rb_size);
-
-		KGSL_FT_INFO(device, "Good RB buffer size 0x%X\n",
-			ft_data->good_rb_size);
-		adreno_dump_rb(device, ft_data->good_rb_buffer,
-			ft_data->good_rb_size<<2, 0, ft_data->good_rb_size);
-
-	}
-}
-
-static int
-_adreno_ft_resubmit_rb(struct kgsl_device *device,
-			struct adreno_ringbuffer *rb,
-			struct kgsl_context *context,
-			struct adreno_ft_data *ft_data,
-			unsigned int *buff, unsigned int size)
-{
-	unsigned int ret = 0;
-
-	_adreno_debug_ft_info(device, ft_data);
-
-	if (_adreno_ft_restart_device(device, context, ft_data))
-		return 1;
-
-	if (size) {
-
-		/* submit commands and wait for them to pass */
-		adreno_ringbuffer_restore(rb, buff, size);
-
-		ret = adreno_idle(device);
-	}
-
-	return ret;
-}
-
-
-static int
-_adreno_ft(struct kgsl_device *device,
-			struct adreno_ft_data *ft_data)
-{
-	int ret = 0, i;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
-	struct kgsl_context *context;
-	struct adreno_context *adreno_context = NULL;
-	struct adreno_context *last_active_ctx = adreno_dev->drawctxt_active;
-
-	context = kgsl_context_get(device, ft_data->context_id);
-
-	if (context == NULL) {
-		KGSL_FT_CRIT(device, "Last context unknown id:%d\n",
-			ft_data->context_id);
-	} else {
-		adreno_context = context->devctxt;
-		adreno_context->flags |= CTXT_FLAGS_GPU_HANG;
-		/*
-		 * set the invalid ts flag to 0 for this context since we have
-		 * detected a hang for it
-		 */
-		context->wait_on_invalid_ts = false;
-
-		/*
-		 *  This flag will be set by userspace for contexts
-		 *  that do not want to be fault tolerant (ex: OPENCL)
-		 */
-		if (adreno_context->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE) {
-			KGSL_FT_ERR(device,
-			"No FT set for this context play good cmds\n");
-			goto play_good_cmds;
-		}
-
-	}
-
-	/*
-	 * Extract valid contents from rb which can still be executed after
-	 * hang
-	 */
-	adreno_ringbuffer_extract(rb, ft_data);
-
-	/* Check if we detected a long running IB,
-	 * if true do not attempt replay of bad cmds */
-	if (adreno_dev->long_ib) {
-		if (_adreno_check_long_ib(device)) {
-			ft_data->status = 1;
-			_adreno_debug_ft_info(device, ft_data);
-			goto play_good_cmds;
-		} else {
-			adreno_context->flags &= ~CTXT_FLAGS_GPU_HANG;
-			return 0;
-		}
-	}
-
-	/* Do not try the bad commands if  hang is due to a fault */
-	if (device->mmu.fault) {
-		KGSL_FT_ERR(device, "MMU fault skipping bad cmds\n");
-		device->mmu.fault = 0;
-		goto play_good_cmds;
-	}
-
-	if (ft_data->ft_policy & KGSL_FT_DISABLE) {
-		KGSL_FT_ERR(device, "NO FT policy play only good cmds\n");
-		goto play_good_cmds;
-	}
-
-	if (ft_data->ft_policy & KGSL_FT_REPLAY) {
-
-		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
-				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
-
-		if (ret) {
-			KGSL_FT_ERR(device, "Replay unsuccessful\n");
-			ft_data->status = 1;
-		} else
-			goto play_good_cmds;
-	}
-
-	if (ft_data->ft_policy & KGSL_FT_SKIPIB) {
-
-		for (i = 0; i < ft_data->bad_rb_size; i++) {
-			if ((ft_data->bad_rb_buffer[i] ==
-					CP_HDR_INDIRECT_BUFFER_PFD) &&
-				(ft_data->bad_rb_buffer[i+1] == ft_data->ib1)) {
-
-				ft_data->bad_rb_buffer[i] = cp_nop_packet(2);
-				ft_data->bad_rb_buffer[i+1] =
-							KGSL_NOP_IB_IDENTIFIER;
-				ft_data->bad_rb_buffer[i+2] =
-							KGSL_NOP_IB_IDENTIFIER;
-				break;
-			}
-		}
-
-		if ((i == (ft_data->bad_rb_size)) || (!ft_data->ib1)) {
-			KGSL_FT_ERR(device, "Bad IB to NOP not found\n");
-			ft_data->status = 1;
-			goto play_good_cmds;
-		}
-
-		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
-				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
-
-		if (ret) {
-			KGSL_FT_ERR(device, "NOP faulty IB unsuccessful\n");
-			ft_data->status = 1;
-		} else {
-			ft_data->status = 0;
-			goto play_good_cmds;
-		}
-	}
-
-	if (ft_data->ft_policy & KGSL_FT_SKIPFRAME) {
-
-		for (i = 0; i < ft_data->bad_rb_size; i++) {
-			if (ft_data->bad_rb_buffer[i] ==
-					KGSL_END_OF_FRAME_IDENTIFIER) {
-				ft_data->bad_rb_buffer[0] = cp_nop_packet(i);
-				break;
-			}
-		}
-
-		/* EOF not found in RB, discard till EOF in
-		   next IB submission */
-		if (i == ft_data->bad_rb_size) {
-			adreno_context->flags |= CTXT_FLAGS_SKIP_EOF;
-			KGSL_FT_INFO(device,
-			"EOF not found in RB, skip next issueib till EOF\n");
-			ft_data->bad_rb_buffer[0] = cp_nop_packet(i);
-		}
-
-		ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
-				ft_data->bad_rb_buffer, ft_data->bad_rb_size);
-
-		if (ret) {
-			KGSL_FT_ERR(device, "Skip EOF unsuccessful\n");
-			ft_data->status = 1;
-		} else {
-			ft_data->status = 0;
-			goto play_good_cmds;
-		}
-	}
-
-play_good_cmds:
-
-	if (ft_data->status)
-		KGSL_FT_ERR(device, "Bad context commands failed\n");
-	else {
-		KGSL_FT_INFO(device, "Bad context commands success\n");
-
-		if (adreno_context) {
-			adreno_context->flags = (adreno_context->flags &
-				~CTXT_FLAGS_GPU_HANG) | CTXT_FLAGS_GPU_HANG_FT;
-		}
-		adreno_dev->drawctxt_active = last_active_ctx;
-	}
-
-	ret = _adreno_ft_resubmit_rb(device, rb, context, ft_data,
-			ft_data->good_rb_buffer, ft_data->good_rb_size);
-
-	if (ret) {
-		/* If we fail here we can try to invalidate another
-		 * context and try fault tolerance again */
-		ret = -EAGAIN;
-		KGSL_FT_ERR(device, "Playing good commands unsuccessful\n");
-		goto done;
-	} else
-		KGSL_FT_INFO(device, "Playing good commands successful\n");
-
-	/* ringbuffer now has data from the last valid context id,
-	 * so restore the active_ctx to the last valid context */
-	if (ft_data->last_valid_ctx_id) {
-		struct kgsl_context *last_ctx = kgsl_context_get(device,
-			ft_data->last_valid_ctx_id);
-
-		if (last_ctx)
-			adreno_dev->drawctxt_active = last_ctx->devctxt;
-
-		kgsl_context_put(last_ctx);
-	}
-
-done:
-	/* Turn off iommu clocks */
-	if (KGSL_MMU_TYPE_IOMMU == kgsl_mmu_get_mmutype())
-		kgsl_mmu_disable_clk_on_ts(&device->mmu, 0, false);
-
-	kgsl_context_put(context);
-	return ret;
-}
-
-static int
-adreno_ft(struct kgsl_device *device,
-			struct adreno_ft_data *ft_data)
-{
-	int ret = 0;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
-
-	KGSL_FT_INFO(device,
-	"Start Parameters: IB1: 0x%X, "
-	"Bad context_id: %u, global_eop: 0x%x\n",
-	ft_data->ib1, ft_data->context_id, ft_data->global_eop);
-
-	KGSL_FT_INFO(device, "Last issued global timestamp: %x\n",
-			rb->global_ts);
-
-	/* We may need to replay commands multiple times based on whether
-	 * multiple contexts hang the GPU */
-	while (true) {
-
-		ret = _adreno_ft(device, ft_data);
-
-		if (-EAGAIN == ret) {
-			/* setup new fault tolerance parameters and retry, this
-			 * means more than 1 contexts are causing hang */
-			adreno_destroy_ft_data(ft_data);
-			ret = adreno_setup_ft_data(device, ft_data);
-			if (ret)
-				goto done;
-			KGSL_FT_INFO(device,
-			"Retry. Parameters: "
-			"IB1: 0x%X, Bad context_id: %u, global_eop: 0x%x\n",
-			ft_data->ib1, ft_data->context_id,
-			ft_data->global_eop);
-		} else {
-			break;
-		}
-	}
-
-	if (ret)
-		goto done;
-
-	/* Restore correct states after fault tolerance */
-	if (adreno_dev->drawctxt_active)
-		device->mmu.hwpagetable =
-			adreno_dev->drawctxt_active->pagetable;
-	else
-		device->mmu.hwpagetable = device->mmu.defaultpagetable;
-	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-			eoptimestamp), rb->global_ts);
-
-	/* switch to NULL ctxt */
-	if (adreno_dev->drawctxt_active != NULL)
-		adreno_drawctxt_switch(adreno_dev, NULL, 0);
-
-done:
-	adreno_set_max_ts_for_bad_ctxs(device);
-	adreno_mark_context_status(device, ret);
-	KGSL_FT_ERR(device, "policy 0x%X status 0x%x\n",
-			ft_data->ft_policy, ret);
-	return ret;
-}
-
-int
-adreno_dump_and_exec_ft(struct kgsl_device *device)
-{
-	int result = -ETIMEDOUT;
-	struct adreno_ft_data ft_data;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-	unsigned int curr_pwrlevel;
-
-	if (device->state == KGSL_STATE_HUNG)
-		goto done;
-	if (device->state == KGSL_STATE_DUMP_AND_FT) {
-		mutex_unlock(&device->mutex);
-		wait_for_completion(&device->ft_gate);
-		mutex_lock(&device->mutex);
-		if (device->state != KGSL_STATE_HUNG)
-			result = 0;
-	} else {
-		kgsl_pwrctrl_set_state(device, KGSL_STATE_DUMP_AND_FT);
-		INIT_COMPLETION(device->ft_gate);
-		/* Detected a hang */
-
-		/* Run fault tolerance at max power level */
-		curr_pwrlevel = pwr->active_pwrlevel;
-		kgsl_pwrctrl_pwrlevel_change(device, pwr->max_pwrlevel);
-
-		/* Get the fault tolerance data as soon as hang is detected */
-		result = adreno_setup_ft_data(device, &ft_data);
-
-		/*
-		 * If long ib is detected, do not attempt postmortem or
-		 * snapshot, if GPU is still executing commands
-		 * we will get errors
-		 */
-		if (!adreno_dev->long_ib) {
-			/*
-			 * Trigger an automatic dump of the state to
-			 * the console
-			 */
-			kgsl_postmortem_dump(device, 0);
-
-			/*
-			* Make a GPU snapshot.  For now, do it after the
-			* PM dump so we can at least be sure the PM dump
-			* will work as it always has
-			*/
-			kgsl_device_snapshot(device, 1);
-		}
-
-		if (!result) {
-			result = adreno_ft(device, &ft_data);
-			adreno_destroy_ft_data(&ft_data);
-		}
-
-		/* restore power level */
-		kgsl_pwrctrl_pwrlevel_change(device, curr_pwrlevel);
-
-		if (result) {
-			kgsl_pwrctrl_set_state(device, KGSL_STATE_HUNG);
-		} else {
-			kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
-			mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
-		}
-		complete_all(&device->ft_gate);
-	}
-done:
-	return result;
-}
-EXPORT_SYMBOL(adreno_dump_and_exec_ft);
-
-static int adreno_getproperty(struct kgsl_device *device,
-				enum kgsl_property_type type,
-				void *value,
-				unsigned int sizebytes)
-{
-	int status = -EINVAL;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-
-	switch (type) {
-	case KGSL_PROP_DEVICE_INFO:
-		{
-			struct kgsl_devinfo devinfo;
-
-			if (sizebytes != sizeof(devinfo)) {
-				status = -EINVAL;
+			if (sizebytes != sizeof(devinfo)) {
+				status = -EINVAL;
 				break;
 			}
 
@@ -2260,199 +1848,121 @@ static int adreno_setproperty(struct kgsl_device *device,
 			status = 0;
 		}
 		break;
-	case KGSL_PROP_FAULT_TOLERANCE: {
-			struct kgsl_ft_config ftd;
-
-			if (adreno_dev->ft_user_control == 0)
-				break;
-
-			if (sizebytes != sizeof(ftd))
-				break;
+	default:
+		break;
+	}
 
-			if (copy_from_user(&ftd, (void __user *) value,
-							   sizeof(ftd))) {
-				status = -EFAULT;
-				break;
-			}
+	return status;
+}
 
-			if (ftd.ft_policy)
-				adreno_dev->ft_policy = ftd.ft_policy;
-			else
-				adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
+/**
+ * adreno_hw_isidle() - Check if the GPU core is idle
+ * @device: Pointer to the KGSL device structure for the GPU
+ *
+ * Return true if the RBBM status register for the GPU type indicates that the
+ * hardware is idle
+ */
+static bool adreno_hw_isidle(struct kgsl_device *device)
+{
+	unsigned int reg_rbbm_status;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 
-			if (ftd.ft_pf_policy)
-				adreno_dev->ft_pf_policy = ftd.ft_policy;
-			else
-				adreno_dev->ft_pf_policy =
-					KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
+	/* Don't consider ourselves idle if there is an IRQ pending */
+	if (adreno_dev->gpudev->irq_pending(adreno_dev))
+		return false;
 
-			if (ftd.ft_pm_dump)
-				device->pm_dump_enable = 1;
-			else
-				device->pm_dump_enable = 0;
+	/* Read the correct RBBM status for the GPU type */
+	adreno_regread(device,
+		adreno_dev->gpudev->reg_rbbm_status,
+		&reg_rbbm_status);
 
-		}
-		break;
-	default:
-		break;
+	if (adreno_is_a2xx(adreno_dev)) {
+		if (reg_rbbm_status == 0x110)
+			return true;
+	} else if (adreno_is_a3xx(adreno_dev)) {
+		if (!(reg_rbbm_status & 0x80000000))
+			return true;
+	} else {
+		BUG();
 	}
 
-	return status;
+	return false;
 }
 
-static int adreno_ringbuffer_drain(struct kgsl_device *device,
-	unsigned int *regs)
+/**
+ * adreno_isidle() - return true if the GPU hardware is idle
+ * @device: Pointer to the KGSL device structure for the GPU
+ *
+ * Return true if the GPU hardware is idle and there are no commands pending in
+ * the ringbuffer
+ */
+static bool adreno_isidle(struct kgsl_device *device)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
-	unsigned long wait;
-	unsigned long timeout = jiffies + msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
 
-	if (!(rb->flags & KGSL_FLAGS_STARTED))
-		return 0;
-
-	/*
-	 * The first time into the loop, wait for 100 msecs and kick wptr again
-	 * to ensure that the hardware has updated correctly.  After that, kick
-	 * it periodically every KGSL_TIMEOUT_PART msecs until the timeout
-	 * expires
-	 */
-
-	wait = jiffies + msecs_to_jiffies(100);
-
-	do {
-		if (time_after(jiffies, wait)) {
-			/* Check to see if the core is hung */
-			if (adreno_ft_detect(device, regs))
-				return -ETIMEDOUT;
+	/* If the device isn't active, don't force it on. */
+	if (device->state != KGSL_STATE_ACTIVE)
+		return true;
 
-			wait = jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART);
-		}
-		GSL_RB_GET_READPTR(rb, &rb->rptr);
+	GSL_RB_GET_READPTR(rb, &rb->rptr);
 
-		if (time_after(jiffies, timeout)) {
-			KGSL_DRV_ERR(device, "rptr: %x, wptr: %x\n",
-				rb->rptr, rb->wptr);
-			return -ETIMEDOUT;
-		}
-	} while (rb->rptr != rb->wptr);
+	if (rb->rptr == rb->wptr)
+		return adreno_hw_isidle(device);
 
-	return 0;
+	return false;
 }
 
-/* Caller must hold the device mutex. */
+/**
+ * adreno_idle() - wait for the GPU hardware to go idle
+ * @device: Pointer to the KGSL device structure for the GPU
+ *
+ * Wait up to ADRENO_IDLE_TIMEOUT milliseconds for the GPU hardware to go quiet.
+ */
 int adreno_idle(struct kgsl_device *device)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int rbbm_status;
-	unsigned long wait_time;
-	unsigned long wait_time_part;
-	unsigned int prev_reg_val[ft_detect_regs_count];
-
-	memset(prev_reg_val, 0, sizeof(prev_reg_val));
+	unsigned long wait = jiffies + msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
 
-	kgsl_cffdump_regpoll(device->id,
-		adreno_dev->gpudev->reg_rbbm_status << 2,
-		0x00000000, 0x80000000);
-
-retry:
-	/* First, wait for the ringbuffer to drain */
-	if (adreno_ringbuffer_drain(device, prev_reg_val))
-		goto err;
-
-	/* now, wait for the GPU to finish its operations */
-	wait_time = jiffies + msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
-	wait_time_part = jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART);
+	/*
+	 * Make sure the device mutex is held so the dispatcher can't send any
+	 * more commands to the hardware
+	 */
 
-	while (time_before(jiffies, wait_time)) {
-		adreno_regread(device, adreno_dev->gpudev->reg_rbbm_status,
-			&rbbm_status);
-		if (adreno_is_a2xx(adreno_dev)) {
-			if (rbbm_status == 0x110)
-				return 0;
-		} else {
-			if (!(rbbm_status & 0x80000000))
-				return 0;
-		}
+	BUG_ON(!mutex_is_locked(&device->mutex));
 
-		/* Dont wait for timeout, detect hang faster.
-		 */
-		if (time_after(jiffies, wait_time_part)) {
-				wait_time_part = jiffies +
-					msecs_to_jiffies(KGSL_TIMEOUT_PART);
-				if ((adreno_ft_detect(device, prev_reg_val)))
-					goto err;
-		}
+	if (adreno_is_a2xx(adreno_dev))
+		kgsl_cffdump_regpoll(device,
+			adreno_dev->gpudev->reg_rbbm_status << 2, 0x110, 0x110);
+	else
+		kgsl_cffdump_regpoll(device,
+			adreno_dev->gpudev->reg_rbbm_status << 2, 0,
+			0x80000000);
 
+	while (time_before(jiffies, wait)) {
+		if (adreno_isidle(device))
+			return 0;
 	}
 
-err:
-	KGSL_DRV_ERR(device, "spun too long waiting for RB to idle\n");
-	if (KGSL_STATE_DUMP_AND_FT != device->state &&
-		!adreno_dump_and_exec_ft(device)) {
-		wait_time = jiffies + ADRENO_IDLE_TIMEOUT;
-		goto retry;
-	}
+	kgsl_postmortem_dump(device, 0);
+
 	return -ETIMEDOUT;
 }
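
The rewritten adreno_idle() collapses the old drain-then-poll dance into one bounded poll. For reference, a minimal standalone sketch of the jiffies-deadline idiom it relies on (the helper name and the sleep between polls are illustrative, not part of this patch):

	/* Sketch: poll a condition until an absolute jiffies deadline. */
	static int poll_until_idle(struct kgsl_device *device, unsigned int msecs)
	{
		unsigned long wait = jiffies + msecs_to_jiffies(msecs);

		while (time_before(jiffies, wait)) {
			if (adreno_isidle(device))
				return 0;
			usleep_range(1000, 2000);	/* illustrative back-off */
		}
		return -ETIMEDOUT;
	}
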
 
 /**
- * is_adreno_rbbm_status_idle - Check if GPU core is idle by probing
- * rbbm_status register
- * @device - Pointer to the GPU device whose idle status is to be
- * checked
- * @returns - Returns whether the core is idle (based on rbbm_status)
- * false if the core is active, true if the core is idle
+ * adreno_drain() - Drain the dispatch queue
+ * @device: Pointer to the KGSL device structure for the GPU
+ *
+ * Tell the dispatcher to pause - this has the effect of draining the inflight
+ * command batches
  */
-static bool is_adreno_rbbm_status_idle(struct kgsl_device *device)
-{
-	unsigned int reg_rbbm_status;
-	bool status = false;
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-
-	/* Is the core idle? */
-	adreno_regread(device,
-		adreno_dev->gpudev->reg_rbbm_status,
-		&reg_rbbm_status);
-
-	if (adreno_is_a2xx(adreno_dev)) {
-		if (reg_rbbm_status == 0x110)
-			status = true;
-	} else {
-		if (!(reg_rbbm_status & 0x80000000))
-			status = true;
-	}
-	return status;
-}
-
-static unsigned int adreno_isidle(struct kgsl_device *device)
+static int adreno_drain(struct kgsl_device *device)
 {
-	int status = false;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
-	WARN_ON(device->state == KGSL_STATE_INIT);
-	/* If the device isn't active, don't force it on. */
-	if (device->state == KGSL_STATE_ACTIVE) {
-		/* Is the ring buffer is empty? */
-		GSL_RB_GET_READPTR(rb, &rb->rptr);
-		if (!device->active_cnt && (rb->rptr == rb->wptr)) {
-			/*
-			 * Are there interrupts pending? If so then pretend we
-			 * are not idle - this avoids the possiblity that we go
-			 * to a lower power state without handling interrupts
-			 * first.
-			 */
-
-			if (!adreno_dev->gpudev->irq_pending(adreno_dev)) {
-				/* Is the core idle? */
-				status = is_adreno_rbbm_status_idle(device);
-			}
-		}
-	} else {
-		status = true;
-	}
-	return status;
+	adreno_dispatcher_pause(adreno_dev);
+	return 0;
 }
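
adreno_drain() gives the KGSL core a uniform way to quiesce submissions before a power transition. A hypothetical core-side sequence (the suspend path itself is outside this patch):

	/* Assumed caller: stop new submissions, then wait for the GPU. */
	if (device->ftbl->drain)
		device->ftbl->drain(device);
	ret = device->ftbl->idle(device);
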
 
 /* Caller must hold the device mutex. */
@@ -2476,20 +1986,20 @@ struct kgsl_memdesc *adreno_find_ctxtmem(struct kgsl_device *device,
 	unsigned int pt_base, unsigned int gpuaddr, unsigned int size)
 {
 	struct kgsl_context *context;
-	struct adreno_context *adreno_context = NULL;
 	int next = 0;
 	struct kgsl_memdesc *desc = NULL;
 
-	rcu_read_lock();
+	read_lock(&device->context_lock);
 	while (1) {
 		context = idr_get_next(&device->context_idr, &next);
 		if (context == NULL)
 			break;
 
-		adreno_context = (struct adreno_context *)context->devctxt;
-
-		if (kgsl_mmu_pt_equal(&device->mmu, adreno_context->pagetable,
+		if (kgsl_mmu_pt_equal(&device->mmu, context->pagetable,
 					pt_base)) {
+			struct adreno_context *adreno_context;
+
+			adreno_context = ADRENO_CONTEXT(context);
 			desc = &adreno_context->gpustate;
 			if (kgsl_gpuaddr_in_memdesc(desc, gpuaddr, size))
 				break;
@@ -2501,7 +2011,7 @@ struct kgsl_memdesc *adreno_find_ctxtmem(struct kgsl_device *device,
 		next = next + 1;
 		desc = NULL;
 	}
-	rcu_read_unlock();
+	read_unlock(&device->context_lock);
 	return desc;
 }
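
Replacing RCU with device->context_lock means the idr walk above holds the lock for the whole lookup instead of relying on grace periods. The matching writer side (context creation/destruction elsewhere in kgsl) is assumed to look like:

	write_lock(&device->context_lock);
	idr_remove(&device->context_idr, context->id);
	write_unlock(&device->context_lock);
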
 
@@ -2571,7 +2081,7 @@ void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords,
 	if (!in_interrupt())
 		kgsl_pre_hwaccess(device);
 
-	trace_kgsl_regwrite(device, offsetwords, value);
+	kgsl_trace_regwrite(device, offsetwords, value);
 
 	kgsl_cffdump_regwrite(device->id, offsetwords << 2, value);
 	reg = (unsigned int *)(device->reg_virt + (offsetwords << 2));
@@ -2582,366 +2092,6 @@ void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords,
 	__raw_writel(value, reg);
 }
 
-static unsigned int _get_context_id(struct kgsl_context *k_ctxt)
-{
-	unsigned int context_id = KGSL_MEMSTORE_GLOBAL;
-	if (k_ctxt != NULL) {
-		struct adreno_context *a_ctxt = k_ctxt->devctxt;
-		if (k_ctxt->id == KGSL_CONTEXT_INVALID || a_ctxt == NULL)
-			context_id = KGSL_CONTEXT_INVALID;
-		else if (a_ctxt->flags & CTXT_FLAGS_PER_CONTEXT_TS)
-			context_id = k_ctxt->id;
-	}
-
-	return context_id;
-}
-
-static unsigned int adreno_check_hw_ts(struct kgsl_device *device,
-		struct kgsl_context *context, unsigned int timestamp)
-{
-	int status = 0;
-	unsigned int ref_ts, enableflag;
-	unsigned int context_id = _get_context_id(context);
-
-	/*
-	 * If the context ID is invalid, we are in a race with
-	 * the context being destroyed by userspace so bail.
-	 */
-	if (context_id == KGSL_CONTEXT_INVALID) {
-		KGSL_DRV_WARN(device, "context was detached");
-		return -EINVAL;
-	}
-
-	status = kgsl_check_timestamp(device, context, timestamp);
-	if (status)
-		return status;
-
-	kgsl_sharedmem_readl(&device->memstore, &enableflag,
-			KGSL_MEMSTORE_OFFSET(context_id, ts_cmp_enable));
-	/*
-	 * Barrier is needed here to make sure the read from memstore
-	 * has posted
-	 */
-
-	mb();
-
-	if (enableflag) {
-		kgsl_sharedmem_readl(&device->memstore, &ref_ts,
-				KGSL_MEMSTORE_OFFSET(context_id,
-					ref_wait_ts));
-
-		/* Make sure the memstore read has posted */
-		mb();
-		if (timestamp_cmp(ref_ts, timestamp) >= 0) {
-			kgsl_sharedmem_writel(&device->memstore,
-					KGSL_MEMSTORE_OFFSET(context_id,
-						ref_wait_ts), timestamp);
-			/* Make sure the memstore write is posted */
-			wmb();
-		}
-	} else {
-		kgsl_sharedmem_writel(&device->memstore,
-				KGSL_MEMSTORE_OFFSET(context_id,
-					ref_wait_ts), timestamp);
-		enableflag = 1;
-		kgsl_sharedmem_writel(&device->memstore,
-				KGSL_MEMSTORE_OFFSET(context_id,
-					ts_cmp_enable), enableflag);
-		/* Make sure the memstore write gets posted */
-		wmb();
-
-		/*
-		 * submit a dummy packet so that even if all
-		 * commands upto timestamp get executed we will still
-		 * get an interrupt
-		 */
-
-		if (context && device->state != KGSL_STATE_SLUMBER)
-			adreno_ringbuffer_issuecmds(device, context->devctxt,
-					KGSL_CMD_FLAGS_GET_INT, NULL, 0);
-	}
-
-	return 0;
-}
-
-/* Return 1 if the event timestmp has already passed, 0 if it was marked */
-static int adreno_next_event(struct kgsl_device *device,
-		struct kgsl_event *event)
-{
-	return adreno_check_hw_ts(device, event->context, event->timestamp);
-}
-
-static int adreno_check_interrupt_timestamp(struct kgsl_device *device,
-		struct kgsl_context *context, unsigned int timestamp)
-{
-	int status;
-
-	mutex_lock(&device->mutex);
-	status = adreno_check_hw_ts(device, context, timestamp);
-	mutex_unlock(&device->mutex);
-
-	return status;
-}
-
-/*
- wait_event_interruptible_timeout checks for the exit condition before
- placing a process in wait q. For conditional interrupts we expect the
- process to already be in its wait q when its exit condition checking
- function is called.
-*/
-#define kgsl_wait_event_interruptible_timeout(wq, condition, timeout, io)\
-({									\
-	long __ret = timeout;						\
-	if (io)						\
-		__wait_io_event_interruptible_timeout(wq, condition, __ret);\
-	else						\
-		__wait_event_interruptible_timeout(wq, condition, __ret);\
-	__ret;								\
-})
-
-
-
-unsigned int adreno_ft_detect(struct kgsl_device *device,
-						unsigned int *prev_reg_val)
-{
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int curr_reg_val[ft_detect_regs_count];
-	unsigned int fast_hang_detected = 1;
-	unsigned int long_ib_detected = 1;
-	unsigned int i;
-	static unsigned long next_hang_detect_time;
-	static unsigned int prev_global_ts;
-	unsigned int curr_global_ts = 0;
-	unsigned int curr_context_id = 0;
-	static struct adreno_context *curr_context;
-	static struct kgsl_context *context;
-
-	if (!adreno_dev->fast_hang_detect)
-		fast_hang_detected = 0;
-
-	if (!adreno_dev->long_ib_detect)
-		long_ib_detected = 0;
-
-	if (is_adreno_rbbm_status_idle(device)) {
-
-		/*
-		 * On A20X if the RPTR != WPTR and the device is idle, then
-		 * the last write to WPTR probably failed to latch so write it
-		 * again
-		 */
-
-		if (adreno_is_a2xx(adreno_dev)) {
-			unsigned int rptr;
-			adreno_regread(device, REG_CP_RB_RPTR, &rptr);
-			if (rptr != adreno_dev->ringbuffer.wptr)
-				adreno_regwrite(device, REG_CP_RB_WPTR,
-					adreno_dev->ringbuffer.wptr);
-		}
-
-		return 0;
-	}
-
-	/*
-	 * Time interval between hang detection should be KGSL_TIMEOUT_PART
-	 * or more, if next hang detection is requested < KGSL_TIMEOUT_PART
-	 * from the last time do nothing.
-	 */
-	if ((next_hang_detect_time) &&
-		(time_before(jiffies, next_hang_detect_time)))
-			return 0;
-	else
-		next_hang_detect_time = (jiffies +
-			msecs_to_jiffies(KGSL_TIMEOUT_PART-1));
-
-	/* Read the current Hang detect reg values here */
-	for (i = 0; i < ft_detect_regs_count; i++) {
-		if (ft_detect_regs[i] == 0)
-			continue;
-		adreno_regread(device, ft_detect_regs[i],
-			&curr_reg_val[i]);
-	}
-
-	/* Read the current global timestamp here */
-	kgsl_sharedmem_readl(&device->memstore,
-			&curr_global_ts,
-			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-			eoptimestamp));
-
-	mb();
-
-	if (curr_global_ts == prev_global_ts) {
-
-		/* Get the current context here */
-		if (context == NULL) {
-			kgsl_sharedmem_readl(&device->memstore,
-				&curr_context_id,
-				KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-				current_context));
-			context = idr_find(&device->context_idr,
-				curr_context_id);
-			if (context != NULL) {
-				curr_context = context->devctxt;
-				curr_context->ib_gpu_time_used = 0;
-			} else {
-				KGSL_DRV_ERR(device,
-					"Fault tolerance no context found\n");
-			}
-		}
-
-		mb();
-
-		if (curr_context != NULL) {
-
-			curr_context->ib_gpu_time_used += KGSL_TIMEOUT_PART;
-			KGSL_FT_INFO(device,
-			"Proc %s used GPU Time %d ms on timestamp 0x%X\n",
-			curr_context->pid_name, curr_context->ib_gpu_time_used,
-			curr_global_ts+1);
-
-			for (i = 0; i < ft_detect_regs_count; i++) {
-				if (curr_reg_val[i] != prev_reg_val[i]) {
-					fast_hang_detected = 0;
-
-					/* Check for long IB here */
-					if ((i >=
-						LONG_IB_DETECT_REG_INDEX_START)
-						&&
-						(i <=
-						LONG_IB_DETECT_REG_INDEX_END))
-						long_ib_detected = 0;
-				}
-			}
-
-			if (fast_hang_detected) {
-				KGSL_FT_ERR(device,
-					"Proc %s, ctxt_id %d ts %d triggered fault tolerance"
-					" on global ts %d\n",
-					curr_context->pid_name, curr_context->id
-					, (kgsl_readtimestamp(device, context,
-					KGSL_TIMESTAMP_RETIRED)+1),
-					curr_global_ts+1);
-				return 1;
-			}
-
-			if ((long_ib_detected) &&
-				(!(curr_context->flags &
-				 CTXT_FLAGS_NO_FAULT_TOLERANCE))) {
-				curr_context->ib_gpu_time_used +=
-					KGSL_TIMEOUT_PART;
-				if (curr_context->ib_gpu_time_used >
-					KGSL_TIMEOUT_LONG_IB_DETECTION) {
-					if (adreno_dev->long_ib_ts !=
-						curr_global_ts) {
-						KGSL_FT_ERR(device,
-						"Proc %s, ctxt_id %d ts %d"
-						"used GPU for %d ms long ib "
-						"detected on global ts %d\n",
-						curr_context->pid_name,
-						curr_context->id,
-						(kgsl_readtimestamp(device,
-						context,
-						KGSL_TIMESTAMP_RETIRED)+1),
-						curr_context->ib_gpu_time_used,
-						curr_global_ts+1);
-						adreno_dev->long_ib = 1;
-						adreno_dev->long_ib_ts =
-								curr_global_ts;
-						curr_context->ib_gpu_time_used =
-								0;
-						return 1;
-					}
-				}
-			}
-		} else {
-			KGSL_FT_ERR(device,
-				"Last context unknown id:%d\n",
-				curr_context_id);
-		}
-	} else {
-		/* GPU is moving forward */
-		prev_global_ts = curr_global_ts;
-		context = NULL;
-		curr_context = NULL;
-		adreno_dev->long_ib = 0;
-		adreno_dev->long_ib_ts = 0;
-	}
-
-
-	/* If hangs are not detected copy the current reg values
-	 * to previous values and return no hang */
-	for (i = 0; i < ft_detect_regs_count; i++)
-			prev_reg_val[i] = curr_reg_val[i];
-	return 0;
-}
-
-/**
- * adreno_handle_hang - Process a hang detected in adreno_waittimestamp
- * @device - pointer to a KGSL device structure
- * @context - pointer to the active KGSL context
- * @timestamp - the timestamp that the process was waiting for
- *
- * Process a possible GPU hang and try fault tolerance from it
- * cleanly
- */
-static int adreno_handle_hang(struct kgsl_device *device,
-	struct kgsl_context *context, unsigned int timestamp)
-{
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int context_id = _get_context_id(context);
-	unsigned int ts_issued;
-	unsigned int rptr;
-
-	/* Do one last check to see if we somehow made it through */
-	if (kgsl_check_timestamp(device, context, timestamp))
-		return 0;
-
-	ts_issued = adreno_context_timestamp(context, &adreno_dev->ringbuffer);
-
-	adreno_regread(device, REG_CP_RB_RPTR, &rptr);
-	mb();
-
-	KGSL_DRV_WARN(device,
-		     "Device hang detected while waiting for timestamp: "
-		     "<%d:0x%x>, last submitted timestamp: <%d:0x%x>, "
-		     "retired timestamp: <%d:0x%x>, wptr: 0x%x, rptr: 0x%x\n",
-		      context_id, timestamp, context_id, ts_issued, context_id,
-			kgsl_readtimestamp(device, context,
-			KGSL_TIMESTAMP_RETIRED),
-		      adreno_dev->ringbuffer.wptr, rptr);
-
-	/* Return 0 after a successful fault tolerance */
-	if (!adreno_dump_and_exec_ft(device))
-		return 0;
-
-	return -ETIMEDOUT;
-}
-
-static int _check_pending_timestamp(struct kgsl_device *device,
-		struct kgsl_context *context, unsigned int timestamp)
-{
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int context_id = _get_context_id(context);
-	unsigned int ts_issued;
-
-	if (context_id == KGSL_CONTEXT_INVALID)
-		return -EINVAL;
-
-	ts_issued = adreno_context_timestamp(context, &adreno_dev->ringbuffer);
-
-	if (timestamp_cmp(timestamp, ts_issued) <= 0)
-		return 0;
-
-	if (context && !context->wait_on_invalid_ts) {
-		KGSL_DRV_ERR(device, "Cannot wait for invalid ts <%d:0x%x>, last issued ts <%d:0x%x>\n",
-			context_id, timestamp, context_id, ts_issued);
-
-			/* Only print this message once */
-			context->wait_on_invalid_ts = true;
-	}
-
-	return -EINVAL;
-}
-
 /**
  * adreno_waittimestamp - sleep while waiting for the specified timestamp
  * @device - pointer to a KGSL device structure
@@ -2949,155 +2099,35 @@ static int _check_pending_timestamp(struct kgsl_device *device,
  * @timestamp - GPU timestamp to wait for
  * @msecs - amount of time to wait (in milliseconds)
  *
- * Wait 'msecs' milliseconds for the specified timestamp to expire. Wake up
- * every KGSL_TIMEOUT_PART milliseconds to check for a device hang and process
- * one if it happened.  Otherwise, spend most of our time in an interruptible
- * wait for the timestamp interrupt to be processed.  This function must be
- * called with the mutex already held.
+ * Wait up to 'msecs' milliseconds for the specified timestamp to expire.
  */
 static int adreno_waittimestamp(struct kgsl_device *device,
-				struct kgsl_context *context,
-				unsigned int timestamp,
-				unsigned int msecs)
+		struct kgsl_context *context,
+		unsigned int timestamp,
+		unsigned int msecs)
 {
-	static unsigned int io_cnt;
-	struct adreno_context *adreno_ctx = context ? context->devctxt : NULL;
-	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-	unsigned int context_id = _get_context_id(context);
-	unsigned int prev_reg_val[ft_detect_regs_count];
-	unsigned int time_elapsed = 0;
-	unsigned int wait;
-	int ts_compare = 1;
-	int io, ret = -ETIMEDOUT;
-
-	if (context_id == KGSL_CONTEXT_INVALID) {
-		KGSL_DRV_WARN(device, "context was detached");
-		return -EINVAL;
-	}
-
-	/*
-	 * Check to see if the requested timestamp is "newer" then the last
-	 * timestamp issued. If it is complain once and return error.  Only
-	 * print the message once per context so that badly behaving
-	 * applications don't spam the logs
-	 */
+	int ret;
+	struct adreno_context *drawctxt;
 
-	if (adreno_ctx && !(adreno_ctx->flags & CTXT_FLAGS_USER_GENERATED_TS)) {
-		if (_check_pending_timestamp(device, context, timestamp))
-			return -EINVAL;
-
-		/* Reset the invalid timestamp flag on a valid wait */
-		context->wait_on_invalid_ts = false;
+	if (context == NULL) {
+		/* If they are doing this, complain once */
+		dev_WARN_ONCE(device->dev, 1,
+			"IOCTL_KGSL_DEVICE_WAITTIMESTAMP is deprecated\n");
+		return -EINVAL;
 	}
 
+	/* Return -EINVAL if the context has been detached */
+	if (kgsl_context_detached(context))
+		return -EINVAL;
 
-	/* Clear the registers used for hang detection */
-	memset(prev_reg_val, 0, sizeof(prev_reg_val));
-
-	/*
-	 * On the first time through the loop only wait 100ms.
-	 * this gives enough time for the engine to start moving and oddly
-	 * provides better hang detection results than just going the full
-	 * KGSL_TIMEOUT_PART right off the bat. The exception to this rule
-	 * is if msecs happens to be < 100ms then just use the full timeout
-	 */
-
-	wait = 100;
-
-	do {
-		long status;
-
-		/*
-		 * if the timestamp happens while we're not
-		 * waiting, there's a chance that an interrupt
-		 * will not be generated and thus the timestamp
-		 * work needs to be queued.
-		 */
-
-		if (kgsl_check_timestamp(device, context, timestamp)) {
-			queue_work(device->work_queue, &device->ts_expired_ws);
-			ret = 0;
-			break;
-		}
-
-		/* Check to see if the GPU is hung */
-		if (adreno_ft_detect(device, prev_reg_val)) {
-			ret = adreno_handle_hang(device, context, timestamp);
-			break;
-		}
-
-		/*
-		 * For proper power accounting sometimes we need to call
-		 * io_wait_interruptible_timeout and sometimes we need to call
-		 * plain old wait_interruptible_timeout. We call the regular
-		 * timeout N times out of 100, where N is a number specified by
-		 * the current power level
-		 */
-
-		io_cnt = (io_cnt + 1) % 100;
-		io = (io_cnt < pwr->pwrlevels[pwr->active_pwrlevel].io_fraction)
-			? 0 : 1;
-
-		mutex_unlock(&device->mutex);
-
-		/* Wait for a timestamp event */
-		status = kgsl_wait_event_interruptible_timeout(
-			device->wait_queue,
-			adreno_check_interrupt_timestamp(device, context,
-				timestamp), msecs_to_jiffies(wait), io);
-
-		mutex_lock(&device->mutex);
-
-		/*
-		 * If status is non zero then either the condition was satisfied
-		 * or there was an error.  In either event, this is the end of
-		 * the line for us
-		 */
-
-		if (status != 0) {
-			ret = (status > 0) ? 0 : (int) status;
-			break;
-		}
-		time_elapsed += wait;
-
-
-		/* If user specified timestamps are being used, wait at least
-		 * KGSL_SYNCOBJ_SERVER_TIMEOUT msecs for the user driver to
-		 * issue a IB for a timestamp before checking to see if the
-		 * current timestamp we are waiting for is valid or not
-		 */
-
-		if (ts_compare && (adreno_ctx &&
-			(adreno_ctx->flags & CTXT_FLAGS_USER_GENERATED_TS))) {
-			if (time_elapsed > KGSL_SYNCOBJ_SERVER_TIMEOUT) {
-				ret = _check_pending_timestamp(device, context,
-					timestamp);
-				if (ret)
-					break;
-
-				/* Don't do this check again */
-				ts_compare = 0;
-
-				/*
-				 * Reset the invalid timestamp flag on a valid
-				 * wait
-				 */
-
-				context->wait_on_invalid_ts = false;
-			}
-		}
-
-		/*
-		 * We want to wait the floor of KGSL_TIMEOUT_PART
-		 * and (msecs - time_elapsed).
-		 */
+	ret = adreno_drawctxt_wait(ADRENO_DEVICE(device), context,
+		timestamp, msecs_to_jiffies(msecs));
 
-		if (KGSL_TIMEOUT_PART < (msecs - time_elapsed))
-			wait = KGSL_TIMEOUT_PART;
-		else
-			wait = (msecs - time_elapsed);
+	/* If the context got invalidated then return a specific error */
+	drawctxt = ADRENO_CONTEXT(context);
 
-	} while (!msecs || time_elapsed < msecs);
+	if (drawctxt->state == ADRENO_CONTEXT_STATE_INVALID)
+		ret = -EDEADLK;
 
 	return ret;
 }
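
-EDEADLK is now reserved for a context that fault tolerance invalidated, which lets callers tell "your context was killed" apart from an ordinary timeout. A sketch of the expected caller-side handling (the recovery policy here is an assumption, not part of this patch):

	ret = device->ftbl->waittimestamp(device, context, timestamp, msecs);
	if (ret == -EDEADLK) {
		/* context invalidated: assumed recovery is destroy + recreate */
	} else if (ret == -ETIMEDOUT) {
		/* plain timeout: the wait can simply be retried */
	}
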
@@ -3106,13 +2136,13 @@ static unsigned int adreno_readtimestamp(struct kgsl_device *device,
 		struct kgsl_context *context, enum kgsl_timestamp_type type)
 {
 	unsigned int timestamp = 0;
-	unsigned int context_id = _get_context_id(context);
+	unsigned int id = context ? context->id : KGSL_MEMSTORE_GLOBAL;
 
 	/*
-	 * If the context ID is invalid, we are in a race with
+	 * If the context is detached we are in a race with
 	 * the context being destroyed by userspace so bail.
 	 */
-	if (context_id == KGSL_CONTEXT_INVALID) {
+	if (context && kgsl_context_detached(context)) {
 		KGSL_DRV_WARN(device, "context was detached");
 		return timestamp;
 	}
@@ -3126,11 +2156,11 @@ static unsigned int adreno_readtimestamp(struct kgsl_device *device,
 	}
 	case KGSL_TIMESTAMP_CONSUMED:
 		kgsl_sharedmem_readl(&device->memstore, &timestamp,
-			KGSL_MEMSTORE_OFFSET(context_id, soptimestamp));
+			KGSL_MEMSTORE_OFFSET(id, soptimestamp));
 		break;
 	case KGSL_TIMESTAMP_RETIRED:
 		kgsl_sharedmem_readl(&device->memstore, &timestamp,
-			KGSL_MEMSTORE_OFFSET(context_id, eoptimestamp));
+			KGSL_MEMSTORE_OFFSET(id, eoptimestamp));
 		break;
 	}
 
@@ -3142,30 +2172,58 @@ static unsigned int adreno_readtimestamp(struct kgsl_device *device,
 static long adreno_ioctl(struct kgsl_device_private *dev_priv,
 			      unsigned int cmd, void *data)
 {
+	struct kgsl_device *device = dev_priv->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	int result = 0;
-	struct kgsl_drawctxt_set_bin_base_offset *binbase;
-	struct kgsl_context *context;
 
 	switch (cmd) {
-	case IOCTL_KGSL_DRAWCTXT_SET_BIN_BASE_OFFSET:
+	case IOCTL_KGSL_DRAWCTXT_SET_BIN_BASE_OFFSET: {
+		struct kgsl_drawctxt_set_bin_base_offset *binbase;
+		struct kgsl_context *context;
+
 		binbase = data;
 
 		context = kgsl_context_get_owner(dev_priv,
 			binbase->drawctxt_id);
 		if (context) {
 			adreno_drawctxt_set_bin_base_offset(
-				dev_priv->device, context, binbase->offset);
+				device, context, binbase->offset);
 		} else {
 			result = -EINVAL;
-			KGSL_DRV_ERR(dev_priv->device,
+			KGSL_DRV_ERR(device,
 				"invalid drawctxt drawctxt_id %d "
 				"device_id=%d\n",
-				binbase->drawctxt_id, dev_priv->device->id);
+				binbase->drawctxt_id, device->id);
 		}
 
 		kgsl_context_put(context);
 		break;
-
+	}
+	case IOCTL_KGSL_PERFCOUNTER_GET: {
+		struct kgsl_perfcounter_get *get = data;
+		result = adreno_perfcounter_get(adreno_dev, get->groupid,
+			get->countable, &get->offset, PERFCOUNTER_FLAG_NONE);
+		break;
+	}
+	case IOCTL_KGSL_PERFCOUNTER_PUT: {
+		struct kgsl_perfcounter_put *put = data;
+		result = adreno_perfcounter_put(adreno_dev, put->groupid,
+			put->countable);
+		break;
+	}
+	case IOCTL_KGSL_PERFCOUNTER_QUERY: {
+		struct kgsl_perfcounter_query *query = data;
+		result = adreno_perfcounter_query_group(adreno_dev,
+			query->groupid, query->countables,
+			query->count, &query->max_counters);
+		break;
+	}
+	case IOCTL_KGSL_PERFCOUNTER_READ: {
+		struct kgsl_perfcounter_read *read = data;
+		result = adreno_perfcounter_read_group(adreno_dev,
+			read->reads, read->count);
+		break;
+	}
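
The four perfcounter cases above translate into a simple userspace sequence; a hypothetical example (the groupid/countable values are device-specific and purely illustrative):

	struct kgsl_perfcounter_get get = {
		.groupid = 0,		/* illustrative group */
		.countable = 1,		/* illustrative countable */
	};

	if (ioctl(fd, IOCTL_KGSL_PERFCOUNTER_GET, &get) == 0) {
		/* get.offset now names the register to sample */
		struct kgsl_perfcounter_put put = {
			.groupid = get.groupid,
			.countable = get.countable,
		};
		ioctl(fd, IOCTL_KGSL_PERFCOUNTER_PUT, &put);
	}
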
 	default:
 		KGSL_DRV_INFO(dev_priv->device,
 			"invalid ioctl code %08x\n", cmd);
@@ -3187,15 +2245,20 @@ static void adreno_power_stats(struct kgsl_device *device,
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
-	unsigned int cycles;
-
-	/* Get the busy cycles counted since the counter was last reset */
-	/* Calling this function also resets and restarts the counter */
+	unsigned int cycles = 0;
 
-	cycles = adreno_dev->gpudev->busy_cycles(adreno_dev);
+	/*
+	 * Get the busy cycles counted since the counter was last reset.
+	 * If we're not currently active, there shouldn't have been
+	 * any cycles since the last time this function was called.
+	 */
+	if (device->state == KGSL_STATE_ACTIVE)
+		cycles = adreno_dev->gpudev->busy_cycles(adreno_dev);
 
-	/* In order to calculate idle you have to have run the algorithm *
-	 * at least once to get a start time. */
+	/*
+	 * In order to calculate idle you have to have run the algorithm
+	 * at least once to get a start time.
+	 */
 	if (pwr->time != 0) {
 		s64 tmp = ktime_to_us(ktime_get());
 		stats->total_time = tmp - pwr->time;
@@ -3242,6 +2305,7 @@ static const struct kgsl_functable adreno_functable = {
 	.idle = adreno_idle,
 	.isidle = adreno_isidle,
 	.suspend_context = adreno_suspend_context,
+	.init = adreno_init,
 	.start = adreno_start,
 	.stop = adreno_stop,
 	.getproperty = adreno_getproperty,
@@ -3256,13 +2320,15 @@ static const struct kgsl_functable adreno_functable = {
 	.gpuid = adreno_gpuid,
 	.snapshot = adreno_snapshot,
 	.irq_handler = adreno_irq_handler,
+	.drain = adreno_drain,
 	/* Optional functions */
 	.setstate = adreno_setstate,
 	.drawctxt_create = adreno_drawctxt_create,
+	.drawctxt_detach = adreno_drawctxt_detach,
 	.drawctxt_destroy = adreno_drawctxt_destroy,
 	.setproperty = adreno_setproperty,
 	.postmortem_dump = adreno_dump,
-	.next_event = adreno_next_event,
+	.drawctxt_sched = adreno_drawctxt_sched,
 };
 
 static struct platform_driver adreno_platform_driver = {
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index e7ad20c20bbb8f43fa7916e12d7a60482d5329f6..25d1fdd3ad3803b66348cdef44ec27450be0b548 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -25,17 +25,20 @@
 #define ADRENO_DEVICE(device) \
 		KGSL_CONTAINER_OF(device, struct adreno_device, dev)
 
+#define ADRENO_CONTEXT(context) \
+		KGSL_CONTAINER_OF(context, struct adreno_context, base)
+
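
ADRENO_CONTEXT() mirrors ADRENO_DEVICE(): both recover the wrapper struct from an embedded member via container_of. For example (sketch):

	/* 'context' is a struct kgsl_context embedded as .base */
	struct adreno_context *drawctxt = ADRENO_CONTEXT(context);
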
 #define ADRENO_CHIPID_CORE(_id) (((_id) >> 24) & 0xFF)
 #define ADRENO_CHIPID_MAJOR(_id) (((_id) >> 16) & 0xFF)
 #define ADRENO_CHIPID_MINOR(_id) (((_id) >> 8) & 0xFF)
 #define ADRENO_CHIPID_PATCH(_id) ((_id) & 0xFF)
 
 /* Flags to control command packet settings */
-#define KGSL_CMD_FLAGS_NONE             0x00000000
-#define KGSL_CMD_FLAGS_PMODE		0x00000001
-#define KGSL_CMD_FLAGS_INTERNAL_ISSUE	0x00000002
-#define KGSL_CMD_FLAGS_GET_INT		0x00000004
-#define KGSL_CMD_FLAGS_EOF	        0x00000100
+#define KGSL_CMD_FLAGS_NONE             0
+#define KGSL_CMD_FLAGS_PMODE		BIT(0)
+#define KGSL_CMD_FLAGS_INTERNAL_ISSUE   BIT(1)
+#define KGSL_CMD_FLAGS_GET_INT		BIT(2)
+#define KGSL_CMD_FLAGS_WFI              BIT(3)
 
 /* Command identifiers */
 #define KGSL_CONTEXT_TO_MEM_IDENTIFIER	0x2EADBEEF
@@ -78,6 +81,47 @@ enum adreno_gpurev {
 	ADRENO_REV_A330 = 330,
 };
 
+/*
+ * Maximum size of the dispatcher ringbuffer - the actual inflight size will be
+ * smaller than this, but this size allows a larger range of inflight
+ * sizes that can be chosen at runtime
+ */
+
+#define ADRENO_DISPATCH_CMDQUEUE_SIZE 128
+
+/**
+ * struct adreno_dispatcher - container for the adreno GPU dispatcher
+ * @mutex: Mutex to protect the structure
+ * @state: Current state of the dispatcher (active or paused)
+ * @timer: Timer to monitor the progress of the command batches
+ * @fault_timer: Timer used to detect faults in submitted command batches
+ * @inflight: Number of command batch operations pending in the ringbuffer
+ * @fault: True if a HW fault was detected
+ * @pending: Priority list of contexts waiting to submit command batches
+ * @plist_lock: Spin lock to protect the pending queue
+ * @cmdqueue: Queue of command batches currently in flight
+ * @head: pointer to the head of the cmdqueue.  This is the oldest pending
+ * operation
+ * @tail: pointer to the tail of the cmdqueue.  This is the most recently
+ * submitted operation
+ * @work: work_struct to put the dispatcher in a work queue
+ * @kobj: kobject for the dispatcher directory in the device sysfs node
+ */
+struct adreno_dispatcher {
+	struct mutex mutex;
+	unsigned int state;
+	struct timer_list timer;
+	struct timer_list fault_timer;
+	unsigned int inflight;
+	int fault;
+	struct plist_head pending;
+	spinlock_t plist_lock;
+	struct kgsl_cmdbatch *cmdqueue[ADRENO_DISPATCH_CMDQUEUE_SIZE];
+	unsigned int head;
+	unsigned int tail;
+	struct work_struct work;
+	struct kobject kobj;
+};
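
head and tail implement a conventional ring over cmdqueue[]; a sketch of the index arithmetic the dispatcher is expected to use (the helper name is illustrative):

	static inline unsigned int cmdqueue_next(unsigned int idx)
	{
		return (idx + 1) % ADRENO_DISPATCH_CMDQUEUE_SIZE;
	}

	/* empty when head == tail; retire from head, submit at tail */
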
+
 struct adreno_gpudev;
 
 struct adreno_device {
@@ -105,7 +149,6 @@ struct adreno_device {
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
 	unsigned int ft_policy;
-	unsigned int ft_user_control;
 	unsigned int long_ib_detect;
 	unsigned int long_ib;
 	unsigned int long_ib_ts;
@@ -113,6 +156,46 @@ struct adreno_device {
 	unsigned int gpulist_index;
 	struct ocmem_buf *ocmem_hdl;
 	unsigned int ocmem_base;
+	unsigned int gpu_cycles;
+	struct adreno_dispatcher dispatcher;
+};
+
+#define PERFCOUNTER_FLAG_NONE 0x0
+#define PERFCOUNTER_FLAG_KERNEL 0x1
+
+/* Structs to maintain the list of active performance counters */
+
+/**
+ * struct adreno_perfcount_register: register state
+ * @countable: countable the register holds
+ * @refcount: number of users of the register
+ * @offset: register hardware offset
+ * @flags: usage flags for this register (e.g. PERFCOUNTER_FLAG_KERNEL)
+ */
+struct adreno_perfcount_register {
+	unsigned int countable;
+	unsigned int refcount;
+	unsigned int offset;
+	unsigned int flags;
+};
+
+/**
+ * struct adreno_perfcount_group: registers for a hardware group
+ * @regs: available registers for this group
+ * @reg_count: total registers for this group
+ */
+struct adreno_perfcount_group {
+	struct adreno_perfcount_register *regs;
+	unsigned int reg_count;
+};
+
+/**
+ * struct adreno_perfcounters: all available perfcounter groups
+ * @groups: available groups for this device
+ * @group_count: total groups for this device
+ */
+struct adreno_perfcounters {
+	struct adreno_perfcount_group *groups;
+	unsigned int group_count;
 };
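
These tables are meant to back a reserve-and-refcount scheme. A sketch of the lookup adreno_perfcounter_get() presumably performs (its body lives in the new perfcounter code, not in this hunk, so this is an assumption about its shape):

	struct adreno_perfcount_group *group = &counters->groups[groupid];
	unsigned int i;

	for (i = 0; i < group->reg_count; i++) {
		if (group->regs[i].countable == countable) {
			group->regs[i].refcount++;	/* already programmed */
			*offset = group->regs[i].offset;
			return 0;
		}
	}
	/* else claim a free slot (refcount == 0) and program the countable */
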
 
 struct adreno_gpudev {
@@ -126,60 +209,50 @@ struct adreno_gpudev {
 	/* keeps track of when we need to execute the draw workaround code */
 	int ctx_switches_since_last_draw;
 
+	struct adreno_perfcounters *perfcounters;
+
 	/* GPU specific function hooks */
 	int (*ctxt_create)(struct adreno_device *, struct adreno_context *);
-	void (*ctxt_save)(struct adreno_device *, struct adreno_context *);
-	void (*ctxt_restore)(struct adreno_device *, struct adreno_context *);
-	void (*ctxt_draw_workaround)(struct adreno_device *,
+	int (*ctxt_save)(struct adreno_device *, struct adreno_context *);
+	int (*ctxt_restore)(struct adreno_device *, struct adreno_context *);
+	int (*ctxt_draw_workaround)(struct adreno_device *,
 					struct adreno_context *);
 	irqreturn_t (*irq_handler)(struct adreno_device *);
 	void (*irq_control)(struct adreno_device *, int);
 	unsigned int (*irq_pending)(struct adreno_device *);
 	void * (*snapshot)(struct adreno_device *, void *, int *, int);
-	void (*rb_init)(struct adreno_device *, struct adreno_ringbuffer *);
+	int (*rb_init)(struct adreno_device *, struct adreno_ringbuffer *);
+	void (*perfcounter_init)(struct adreno_device *);
 	void (*start)(struct adreno_device *);
 	unsigned int (*busy_cycles)(struct adreno_device *);
+	void (*perfcounter_enable)(struct adreno_device *, unsigned int group,
+		unsigned int counter, unsigned int countable);
+	uint64_t (*perfcounter_read)(struct adreno_device *adreno_dev,
+		unsigned int group, unsigned int counter,
+		unsigned int offset);
 };
 
-/*
- * struct adreno_ft_data - Structure that contains all information to
- * perform gpu fault tolerance
- * @ib1 - IB1 that the GPU was executing when hang happened
- * @context_id - Context which caused the hang
- * @global_eop - eoptimestamp at time of hang
- * @rb_buffer - Buffer that holds the commands from good contexts
- * @rb_size - Number of valid dwords in rb_buffer
- * @bad_rb_buffer - Buffer that holds commands from the hanging context
- * bad_rb_size - Number of valid dwords in bad_rb_buffer
- * @good_rb_buffer - Buffer that holds commands from good contexts
- * good_rb_size - Number of valid dwords in good_rb_buffer
- * @last_valid_ctx_id - The last context from which commands were placed in
- * ringbuffer before the GPU hung
- * @step - Current fault tolerance step being executed
- * @err_code - Fault tolerance error code
- * @fault - Indicates whether the hang was caused due to a pagefault
- * @start_of_replay_cmds - Offset in ringbuffer from where commands can be
- * replayed during fault tolerance
- * @replay_for_snapshot - Offset in ringbuffer where IB's can be saved for
- * replaying with snapshot
- */
-struct adreno_ft_data {
-	unsigned int ib1;
-	unsigned int context_id;
-	unsigned int global_eop;
-	unsigned int *rb_buffer;
-	unsigned int rb_size;
-	unsigned int *bad_rb_buffer;
-	unsigned int bad_rb_size;
-	unsigned int *good_rb_buffer;
-	unsigned int good_rb_size;
-	unsigned int last_valid_ctx_id;
-	unsigned int status;
-	unsigned int ft_policy;
-	unsigned int err_code;
-	unsigned int start_of_replay_cmds;
-	unsigned int replay_for_snapshot;
-};
+#define FT_DETECT_REGS_COUNT 12
+
+/* Fault Tolerance policy flags */
+#define  KGSL_FT_OFF                      BIT(0)
+#define  KGSL_FT_REPLAY                   BIT(1)
+#define  KGSL_FT_SKIPIB                   BIT(2)
+#define  KGSL_FT_SKIPFRAME                BIT(3)
+#define  KGSL_FT_DISABLE                  BIT(4)
+#define  KGSL_FT_TEMP_DISABLE             BIT(5)
+#define  KGSL_FT_DEFAULT_POLICY           (KGSL_FT_REPLAY | KGSL_FT_SKIPIB)
+
+/* This internal bit is used to skip the PM dump on replayed command batches */
+#define  KGSL_FT_SKIP_PMDUMP              BIT(31)
+
+/* Pagefault policy flags */
+#define KGSL_FT_PAGEFAULT_INT_ENABLE         BIT(0)
+#define KGSL_FT_PAGEFAULT_GPUHALT_ENABLE     BIT(1)
+#define KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE   BIT(2)
+#define KGSL_FT_PAGEFAULT_LOG_ONE_PER_INT    BIT(3)
+#define KGSL_FT_PAGEFAULT_DEFAULT_POLICY     (KGSL_FT_PAGEFAULT_INT_ENABLE | \
+					KGSL_FT_PAGEFAULT_GPUHALT_ENABLE)
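
Because the policies are independent bits, more than one can be armed at a time. A minimal sketch of how fault-tolerance code would consult them:

	if (adreno_dev->ft_policy & KGSL_FT_REPLAY) {
		/* first try replaying the faulting command batch */
	} else if (adreno_dev->ft_policy & KGSL_FT_SKIPIB) {
		/* otherwise NOP out the offending IB and continue */
	}
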
 
 extern struct adreno_gpudev adreno_a2xx_gpudev;
 extern struct adreno_gpudev adreno_a3xx_gpudev;
@@ -203,7 +276,6 @@ extern const unsigned int a330_registers[];
 extern const unsigned int a330_registers_count;
 
 extern unsigned int ft_detect_regs[];
-extern const unsigned int ft_detect_regs_count;
 
 
 int adreno_idle(struct kgsl_device *device);
@@ -213,6 +285,8 @@ void adreno_regwrite(struct kgsl_device *device, unsigned int offsetwords,
 				unsigned int value);
 
 int adreno_dump(struct kgsl_device *device, int manual);
+unsigned int adreno_a3xx_rbbm_clock_ctl_default(struct adreno_device
+							*adreno_dev);
 
 struct kgsl_memdesc *adreno_find_region(struct kgsl_device *device,
 						unsigned int pt_base,
@@ -228,13 +302,30 @@ struct kgsl_memdesc *adreno_find_ctxtmem(struct kgsl_device *device,
 void *adreno_snapshot(struct kgsl_device *device, void *snapshot, int *remain,
 		int hang);
 
-int adreno_dump_and_exec_ft(struct kgsl_device *device);
+void adreno_dispatcher_start(struct adreno_device *adreno_dev);
+int adreno_dispatcher_init(struct adreno_device *adreno_dev);
+void adreno_dispatcher_close(struct adreno_device *adreno_dev);
+int adreno_dispatcher_idle(struct adreno_device *adreno_dev,
+		unsigned int timeout);
+void adreno_dispatcher_irq_fault(struct kgsl_device *device);
+void adreno_dispatcher_stop(struct adreno_device *adreno_dev);
+
+int adreno_context_queue_cmd(struct adreno_device *adreno_dev,
+		struct adreno_context *drawctxt, struct kgsl_cmdbatch *cmdbatch,
+		uint32_t *timestamp);
+
+void adreno_dispatcher_schedule(struct kgsl_device *device);
+void adreno_dispatcher_pause(struct adreno_device *adreno_dev);
+void adreno_dispatcher_queue_context(struct kgsl_device *device,
+	struct adreno_context *drawctxt);
+int adreno_reset(struct kgsl_device *device);
 
-void adreno_dump_rb(struct kgsl_device *device, const void *buf,
-			 size_t len, int start, int size);
+int adreno_perfcounter_get(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable, unsigned int *offset,
+	unsigned int flags);
 
-unsigned int adreno_ft_detect(struct kgsl_device *device,
-						unsigned int *prev_reg_val);
+int adreno_perfcounter_put(struct adreno_device *adreno_dev,
+	unsigned int groupid, unsigned int countable);
 
 static inline int adreno_is_a200(struct adreno_device *adreno_dev)
 {
@@ -297,23 +388,33 @@ static inline int adreno_is_a330(struct adreno_device *adreno_dev)
 	return (adreno_dev->gpurev == ADRENO_REV_A330);
 }
 
+static inline int adreno_is_a330v2(struct adreno_device *adreno_dev)
+{
+	return ((adreno_dev->gpurev == ADRENO_REV_A330) &&
+		(ADRENO_CHIPID_PATCH(adreno_dev->chip_id) > 0));
+}
+
 static inline int adreno_rb_ctxtswitch(unsigned int *cmd)
 {
 	return (cmd[0] == cp_nop_packet(1) &&
 		cmd[1] == KGSL_CONTEXT_TO_MEM_IDENTIFIER);
 }
 
+/**
+ * adreno_context_timestamp() - Return the last queued timestamp for the context
+ * @k_ctxt: Pointer to the KGSL context to query
+ * @rb: Pointer to the ringbuffer structure for the GPU
+ *
+ * Return the last queued timestamp for the given context. This is used to verify
+ * that incoming requests are not using an invalid (unsubmitted) timestamp
+ */
 static inline int adreno_context_timestamp(struct kgsl_context *k_ctxt,
 		struct adreno_ringbuffer *rb)
 {
-	struct adreno_context *a_ctxt = NULL;
-
-	if (k_ctxt)
-		a_ctxt = k_ctxt->devctxt;
-
-	if (a_ctxt && a_ctxt->flags & CTXT_FLAGS_PER_CONTEXT_TS)
+	if (k_ctxt) {
+		struct adreno_context *a_ctxt = ADRENO_CONTEXT(k_ctxt);
 		return a_ctxt->timestamp;
-
+	}
 	return rb->global_ts;
 }
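
One expected use of the helper, sketched (rejecting out-of-range user timestamps is an assumption drawn from the kernel-doc above):

	if (timestamp_cmp(timestamp,
			adreno_context_timestamp(context, rb)) > 0)
		return -EINVAL;	/* timestamp was never queued */
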
 
diff --git a/drivers/gpu/msm/adreno_a2xx.c b/drivers/gpu/msm/adreno_a2xx.c
index f3ebe0158f564f8302801907e7087d6d35723088..93068c05c86fcdb50a2fff07a39d4eef63134684 100644
--- a/drivers/gpu/msm/adreno_a2xx.c
+++ b/drivers/gpu/msm/adreno_a2xx.c
@@ -1355,7 +1355,7 @@ static int a2xx_create_gmem_shadow(struct adreno_device *adreno_dev,
 	tmp_ctx.gmem_base = adreno_dev->gmem_base;
 
 	result = kgsl_allocate(&drawctxt->context_gmem_shadow.gmemshadow,
-		drawctxt->pagetable, drawctxt->context_gmem_shadow.size);
+		drawctxt->base.pagetable, drawctxt->context_gmem_shadow.size);
 
 	if (result)
 		return result;
@@ -1365,7 +1365,7 @@ static int a2xx_create_gmem_shadow(struct adreno_device *adreno_dev,
 
 	/* blank out gmem shadow. */
 	kgsl_sharedmem_set(&drawctxt->context_gmem_shadow.gmemshadow, 0, 0,
-			   drawctxt->context_gmem_shadow.size);
+			drawctxt->context_gmem_shadow.size);
 
 	/* build quad vertex buffer */
 	build_quad_vtxbuff(drawctxt, &drawctxt->context_gmem_shadow,
@@ -1409,13 +1409,13 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev,
 	 */
 
 	ret = kgsl_allocate(&drawctxt->gpustate,
-		drawctxt->pagetable, _context_size(adreno_dev));
+		drawctxt->base.pagetable, _context_size(adreno_dev));
 
 	if (ret)
 		return ret;
 
-	kgsl_sharedmem_set(&drawctxt->gpustate, 0, 0,
-		_context_size(adreno_dev));
+	kgsl_sharedmem_set(&drawctxt->gpustate,
+		0, 0, _context_size(adreno_dev));
 
 	tmp_ctx.cmd = tmp_ctx.start
 	    = (unsigned int *)((char *)drawctxt->gpustate.hostptr + CMD_OFFSET);
@@ -1439,8 +1439,8 @@ static int a2xx_drawctxt_create(struct adreno_device *adreno_dev,
 	kgsl_cache_range_op(&drawctxt->gpustate,
 			    KGSL_CACHE_OP_FLUSH);
 
-	kgsl_cffdump_syncmem(NULL, &drawctxt->gpustate,
-			drawctxt->gpustate.gpuaddr,
+	kgsl_cffdump_syncmem(NULL,
+			&drawctxt->gpustate, drawctxt->gpustate.gpuaddr,
 			drawctxt->gpustate.size, false);
 
 done:
@@ -1450,7 +1450,7 @@ done:
 	return ret;
 }
 
-static void a2xx_drawctxt_draw_workaround(struct adreno_device *adreno_dev,
+static int a2xx_drawctxt_draw_workaround(struct adreno_device *adreno_dev,
 					struct adreno_context *context)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
@@ -1467,7 +1467,7 @@ static void a2xx_drawctxt_draw_workaround(struct adreno_device *adreno_dev,
 				ADRENO_NUM_CTX_SWITCH_ALLOWED_BEFORE_DRAW)
 			adreno_dev->gpudev->ctx_switches_since_last_draw = 0;
 		else
-			return;
+			return 0;
 		/*
 		 * Issue an empty draw call to avoid possible hangs due to
 		 * repeated idles without intervening draw calls.
@@ -1498,138 +1498,201 @@ static void a2xx_drawctxt_draw_workaround(struct adreno_device *adreno_dev,
 					| adreno_dev->pix_shader_start;
 	}
 
-	adreno_ringbuffer_issuecmds(device, context, KGSL_CMD_FLAGS_PMODE,
-			&cmd[0], cmds - cmd);
+	return adreno_ringbuffer_issuecmds(device, context,
+			KGSL_CMD_FLAGS_PMODE, &cmd[0], cmds - cmd);
 }
 
-static void a2xx_drawctxt_save(struct adreno_device *adreno_dev,
+static int a2xx_drawctxt_save(struct adreno_device *adreno_dev,
 			struct adreno_context *context)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
+	int ret;
 
 	if (context == NULL || (context->flags & CTXT_FLAGS_BEING_DESTROYED))
-		return;
+		return 0;
 
-	if (context->flags & CTXT_FLAGS_GPU_HANG)
-		KGSL_CTXT_WARN(device,
-			"Current active context has caused gpu hang\n");
+	if (context->state == ADRENO_CONTEXT_STATE_INVALID)
+		return 0;
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
-
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->reg_save[1],
+			context->reg_save[2] << 2, true);
 		/* save registers and constants. */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->reg_save, 3);
 
+		if (ret)
+			return ret;
+
 		if (context->flags & CTXT_FLAGS_SHADER_SAVE) {
+			kgsl_cffdump_syncmem(NULL,
+				&context->gpustate,
+				context->shader_save[1],
+				context->shader_save[2] << 2, true);
 			/* save shader partitioning and instructions. */
-			adreno_ringbuffer_issuecmds(device, context,
+			ret = adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_PMODE,
 				context->shader_save, 3);
+			if (ret)
+				return ret;
 
+			kgsl_cffdump_syncmem(NULL,
+				&context->gpustate,
+				context->shader_fixup[1],
+				context->shader_fixup[2] << 2, true);
 			/*
 			 * fixup shader partitioning parameter for
 			 *  SET_SHADER_BASES.
 			 */
-			adreno_ringbuffer_issuecmds(device, context,
+			ret = adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->shader_fixup, 3);
 
+			if (ret)
+				return ret;
+
 			context->flags |= CTXT_FLAGS_SHADER_RESTORE;
 		}
 	}
 
 	if ((context->flags & CTXT_FLAGS_GMEM_SAVE) &&
 	    (context->flags & CTXT_FLAGS_GMEM_SHADOW)) {
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->context_gmem_shadow.gmem_save[1],
+			context->context_gmem_shadow.gmem_save[2] << 2, true);
 		/* save gmem.
 		 * (note: changes shader. shader must already be saved.)
 		 */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_PMODE,
 			context->context_gmem_shadow.gmem_save, 3);
 
+		if (ret)
+			return ret;
+
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->chicken_restore[1],
+			context->chicken_restore[2] << 2, true);
+
 		/* Restore TP0_CHICKEN */
 		if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
-			adreno_ringbuffer_issuecmds(device, context,
+			ret = adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->chicken_restore, 3);
+
+			if (ret)
+				return ret;
 		}
 		adreno_dev->gpudev->ctx_switches_since_last_draw = 0;
 
 		context->flags |= CTXT_FLAGS_GMEM_RESTORE;
 	} else if (adreno_is_a2xx(adreno_dev))
-		a2xx_drawctxt_draw_workaround(adreno_dev, context);
+		return a2xx_drawctxt_draw_workaround(adreno_dev, context);
+
+	return 0;
 }
 
-static void a2xx_drawctxt_restore(struct adreno_device *adreno_dev,
+static int a2xx_drawctxt_restore(struct adreno_device *adreno_dev,
 			struct adreno_context *context)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 	unsigned int cmds[5];
+	int ret = 0;
 
 	if (context == NULL) {
-		/* No context - set the default apgetable and thats it */
+		/* No context - set the default pagetable and that's it */
+		unsigned int id;
+		/*
+		 * If there isn't a current context, the kgsl_mmu_setstate
+		 * will use the CPU path so we don't need to give
+		 * it a valid context id.
+		 */
+		id = (adreno_dev->drawctxt_active != NULL)
+			? adreno_dev->drawctxt_active->base.id
+			: KGSL_CONTEXT_INVALID;
 		kgsl_mmu_setstate(&device->mmu, device->mmu.defaultpagetable,
-				adreno_dev->drawctxt_active->id);
-		return;
+				  id);
+		return 0;
 	}
 
-	KGSL_CTXT_INFO(device, "context flags %08x\n", context->flags);
-
 	cmds[0] = cp_nop_packet(1);
 	cmds[1] = KGSL_CONTEXT_TO_MEM_IDENTIFIER;
 	cmds[2] = cp_type3_packet(CP_MEM_WRITE, 2);
 	cmds[3] = device->memstore.gpuaddr +
 		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL, current_context);
-	cmds[4] = context->id;
-	adreno_ringbuffer_issuecmds(device, context, KGSL_CMD_FLAGS_NONE,
+	cmds[4] = context->base.id;
+	ret = adreno_ringbuffer_issuecmds(device, context, KGSL_CMD_FLAGS_NONE,
 					cmds, 5);
-	kgsl_mmu_setstate(&device->mmu, context->pagetable, context->id);
+	if (ret)
+		return ret;
 
-#ifndef CONFIG_MSM_KGSL_CFF_DUMP_NO_CONTEXT_MEM_DUMP
-	kgsl_cffdump_syncmem(NULL, &context->gpustate,
-		context->gpustate.gpuaddr, LCC_SHADOW_SIZE +
-		REG_SHADOW_SIZE + CMD_BUFFER_SIZE + TEX_SHADOW_SIZE, false);
-#endif
+	kgsl_mmu_setstate(&device->mmu, context->base.pagetable,
+			context->base.id);
 
 	/* restore gmem.
 	 *  (note: changes shader. shader must not already be restored.)
 	 */
 	if (context->flags & CTXT_FLAGS_GMEM_RESTORE) {
-		adreno_ringbuffer_issuecmds(device, context,
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->context_gmem_shadow.gmem_restore[1],
+			context->context_gmem_shadow.gmem_restore[2] << 2,
+			true);
+
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_PMODE,
 			context->context_gmem_shadow.gmem_restore, 3);
+		if (ret)
+			return ret;
 
 		if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
+			kgsl_cffdump_syncmem(NULL, &context->gpustate,
+				context->chicken_restore[1],
+				context->chicken_restore[2] << 2, true);
+
 			/* Restore TP0_CHICKEN */
-			adreno_ringbuffer_issuecmds(device, context,
+			ret = adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->chicken_restore, 3);
+			if (ret)
+				return ret;
 		}
 
 		context->flags &= ~CTXT_FLAGS_GMEM_RESTORE;
 	}
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
+		kgsl_cffdump_syncmem(NULL, &context->gpustate,
+			context->reg_restore[1],
+			context->reg_restore[2] << 2, true);
 
 		/* restore registers and constants. */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, context->reg_restore, 3);
+		if (ret)
+			return ret;
 
 		/* restore shader instructions & partitioning. */
 		if (context->flags & CTXT_FLAGS_SHADER_RESTORE) {
-			adreno_ringbuffer_issuecmds(device, context,
+			kgsl_cffdump_syncmem(NULL, &context->gpustate,
+				context->shader_restore[1],
+				context->shader_restore[2] << 2, true);
+
+			ret = adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_NONE,
 				context->shader_restore, 3);
+			if (ret)
+				return ret;
 		}
 	}
 
 	if (adreno_is_a20x(adreno_dev)) {
 		cmds[0] = cp_type3_packet(CP_SET_BIN_BASE_OFFSET, 1);
 		cmds[1] = context->bin_base_offset;
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, cmds, 2);
 	}
+
+	return ret;
 }
 
 /*
@@ -1696,13 +1759,14 @@ static void a2xx_cp_intrcallback(struct kgsl_device *device)
 
 	if (!status) {
 		if (master_status & MASTER_INT_SIGNAL__CP_INT_STAT) {
-			/* This indicates that we could not read CP_INT_STAT.
-			 * As a precaution just wake up processes so
-			 * they can check their timestamps. Since, we
-			 * did not ack any interrupts this interrupt will
-			 * be generated again */
+			/*
+			 * This indicates that we could not read CP_INT_STAT.
+			 * As a precaution schedule the dispatcher to check
+			 * things out. Since we did not ack any interrupts this
+			 * interrupt will be generated again
+			 */
 			KGSL_DRV_WARN(device, "Unable to read CP_INT_STATUS\n");
-			wake_up_interruptible_all(&device->wait_queue);
+			adreno_dispatcher_schedule(device);
 		} else
 			KGSL_DRV_WARN(device, "Spurious interrput detected\n");
 		return;
@@ -1727,9 +1791,8 @@ static void a2xx_cp_intrcallback(struct kgsl_device *device)
 	adreno_regwrite(device, REG_CP_INT_ACK, status);
 
 	if (status & (CP_INT_CNTL__IB1_INT_MASK | CP_INT_CNTL__RB_INT_MASK)) {
-		KGSL_CMD_WARN(rb->device, "ringbuffer ib1/rb interrupt\n");
 		queue_work(device->work_queue, &device->ts_expired_ws);
-		wake_up_interruptible_all(&device->wait_queue);
+		adreno_dispatcher_schedule(device);
 	}
 }
 
@@ -1828,13 +1891,16 @@ static unsigned int a2xx_irq_pending(struct adreno_device *adreno_dev)
 		(mh & kgsl_mmu_get_int_mask())) ? 1 : 0;
 }
 
-static void a2xx_rb_init(struct adreno_device *adreno_dev,
+static int a2xx_rb_init(struct adreno_device *adreno_dev,
 			struct adreno_ringbuffer *rb)
 {
 	unsigned int *cmds, cmds_gpu;
 
 	/* ME_INIT */
 	cmds = adreno_ringbuffer_allocspace(rb, NULL, 19);
+	if (cmds == NULL)
+		return -ENOMEM;
+
 	cmds_gpu = rb->buffer_desc.gpuaddr + sizeof(uint)*(rb->wptr-19);
 
 	GSL_RB_WRITE(cmds, cmds_gpu, cp_type3_packet(CP_ME_INIT, 18));
@@ -1887,6 +1953,8 @@ static void a2xx_rb_init(struct adreno_device *adreno_dev,
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
 	adreno_ringbuffer_submit(rb);
+
+	return 0;
 }
 
 static unsigned int a2xx_busy_cycles(struct adreno_device *adreno_dev)
diff --git a/drivers/gpu/msm/adreno_a3xx.c b/drivers/gpu/msm/adreno_a3xx.c
index 019a6c78a8eca4201524b6614e2a635e8a8296be..1e61279e10f70fb9ceca583d23926f4fb6f02fa7 100644
--- a/drivers/gpu/msm/adreno_a3xx.c
+++ b/drivers/gpu/msm/adreno_a3xx.c
@@ -445,6 +445,21 @@ static void build_regconstantsave_cmds(struct adreno_device *adreno_dev,
 	tmp_ctx.cmd = cmd;
 }
 
+unsigned int adreno_a3xx_rbbm_clock_ctl_default(struct adreno_device
+							*adreno_dev)
+{
+	if (adreno_is_a305(adreno_dev))
+		return A305_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a320(adreno_dev))
+		return A320_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a330v2(adreno_dev))
+		return A330v2_RBBM_CLOCK_CTL_DEFAULT;
+	else if (adreno_is_a330(adreno_dev))
+		return A330_RBBM_CLOCK_CTL_DEFAULT;
+
+	BUG();
+}
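
The GMEM copy builders below now pull the per-core default rather than a single A3XX_RBBM_CLOCK_CTL_DEFAULT constant, e.g.:

	*cmds++ = cp_type0_packet(A3XX_RBBM_CLOCK_CTL, 1);
	*cmds++ = adreno_a3xx_rbbm_clock_ctl_default(adreno_dev);
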
+
 /* Copy GMEM contents to system memory shadow. */
 static unsigned int *build_gmem2sys_cmds(struct adreno_device *adreno_dev,
 					 struct adreno_context *drawctxt,
@@ -454,7 +469,7 @@ static unsigned int *build_gmem2sys_cmds(struct adreno_device *adreno_dev,
 	unsigned int *start = cmds;
 
 	*cmds++ = cp_type0_packet(A3XX_RBBM_CLOCK_CTL, 1);
-	*cmds++ = A3XX_RBBM_CLOCK_CTL_DEFAULT;
+	*cmds++ = adreno_a3xx_rbbm_clock_ctl_default(adreno_dev);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 3);
 	*cmds++ = CP_REG(A3XX_RB_MODE_CONTROL);
@@ -1250,7 +1265,7 @@ static unsigned int *build_sys2gmem_cmds(struct adreno_device *adreno_dev,
 	unsigned int *start = cmds;
 
 	*cmds++ = cp_type0_packet(A3XX_RBBM_CLOCK_CTL, 1);
-	*cmds++ = A3XX_RBBM_CLOCK_CTL_DEFAULT;
+	*cmds++ = adreno_a3xx_rbbm_clock_ctl_default(adreno_dev);
 
 	*cmds++ = cp_type3_packet(CP_SET_CONSTANT, 5);
 	*cmds++ = CP_REG(A3XX_HLSQ_CONTROL_0_REG);
@@ -2302,7 +2317,7 @@ static int a3xx_create_gmem_shadow(struct adreno_device *adreno_dev,
 	tmp_ctx.gmem_base = adreno_dev->gmem_base;
 
 	result = kgsl_allocate(&drawctxt->context_gmem_shadow.gmemshadow,
-		drawctxt->pagetable, drawctxt->context_gmem_shadow.size);
+		drawctxt->base.pagetable, drawctxt->context_gmem_shadow.size);
 
 	if (result)
 		return result;
@@ -2336,7 +2351,7 @@ static int a3xx_drawctxt_create(struct adreno_device *adreno_dev,
 	 */
 
 	ret = kgsl_allocate(&drawctxt->gpustate,
-		drawctxt->pagetable, CONTEXT_SIZE);
+		drawctxt->base.pagetable, CONTEXT_SIZE);
 
 	if (ret)
 		return ret;
@@ -2362,32 +2377,38 @@ done:
 	return ret;
 }
 
-static void a3xx_drawctxt_save(struct adreno_device *adreno_dev,
+static int a3xx_drawctxt_save(struct adreno_device *adreno_dev,
 			   struct adreno_context *context)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
+	int ret;
 
 	if (context == NULL || (context->flags & CTXT_FLAGS_BEING_DESTROYED))
-		return;
+		return 0;
 
-	if (context->flags & CTXT_FLAGS_GPU_HANG)
-		KGSL_CTXT_WARN(device,
-			       "Current active context has caused gpu hang\n");
+	if (context->state == ADRENO_CONTEXT_STATE_INVALID)
+		return 0;
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
 		/* Fixup self modifying IBs for save operations */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, context->save_fixup, 3);
+		if (ret)
+			return ret;
 
 		/* save registers and constants. */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->regconstant_save, 3);
+		if (ret)
+			return ret;
 
 		if (context->flags & CTXT_FLAGS_SHADER_SAVE) {
 			/* Save shader instructions */
-			adreno_ringbuffer_issuecmds(device, context,
+			ret = adreno_ringbuffer_issuecmds(device, context,
 				KGSL_CMD_FLAGS_PMODE, context->shader_save, 3);
+			if (ret)
+				return ret;
 
 			context->flags |= CTXT_FLAGS_SHADER_RESTORE;
 		}
@@ -2400,38 +2421,60 @@ static void a3xx_drawctxt_save(struct adreno_device *adreno_dev,
 		 * already be saved.)
 		 */
 
-		adreno_ringbuffer_issuecmds(device, context,
+		kgsl_cffdump_syncmem(context->base.device,
+			&context->gpustate,
+			context->context_gmem_shadow.gmem_save[1],
+			context->context_gmem_shadow.gmem_save[2] << 2, true);
+
+		ret = adreno_ringbuffer_issuecmds(device, context,
 					KGSL_CMD_FLAGS_PMODE,
 					    context->context_gmem_shadow.
 					    gmem_save, 3);
+		if (ret)
+			return ret;
+
 		context->flags |= CTXT_FLAGS_GMEM_RESTORE;
 	}
+
+	return 0;
 }
 
-static void a3xx_drawctxt_restore(struct adreno_device *adreno_dev,
+static int a3xx_drawctxt_restore(struct adreno_device *adreno_dev,
 			      struct adreno_context *context)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 	unsigned int cmds[5];
+	int ret = 0;
 
 	if (context == NULL) {
 		/* No context - set the default pagetable and thats it */
+		unsigned int id;
+		/*
+		 * If there isn't a current context, the kgsl_mmu_setstate
+		 * will use the CPU path so we don't need to give
+		 * it a valid context id.
+		 */
+		id = (adreno_dev->drawctxt_active != NULL)
+			? adreno_dev->drawctxt_active->base.id
+			: KGSL_CONTEXT_INVALID;
 		kgsl_mmu_setstate(&device->mmu, device->mmu.defaultpagetable,
-				adreno_dev->drawctxt_active->id);
-		return;
+				  id);
+		return 0;
 	}
 
-	KGSL_CTXT_INFO(device, "context flags %08x\n", context->flags);
-
 	cmds[0] = cp_nop_packet(1);
 	cmds[1] = KGSL_CONTEXT_TO_MEM_IDENTIFIER;
 	cmds[2] = cp_type3_packet(CP_MEM_WRITE, 2);
 	cmds[3] = device->memstore.gpuaddr +
 		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL, current_context);
-	cmds[4] = context->id;
-	adreno_ringbuffer_issuecmds(device, context, KGSL_CMD_FLAGS_NONE,
+	cmds[4] = context->base.id;
+	ret = adreno_ringbuffer_issuecmds(device, context, KGSL_CMD_FLAGS_NONE,
 					cmds, 5);
-	kgsl_mmu_setstate(&device->mmu, context->pagetable, context->id);
+	if (ret)
+		return ret;
+
+	kgsl_mmu_setstate(&device->mmu, context->base.pagetable,
+			context->base.id);
 
 	/*
 	 * Restore GMEM.  (note: changes shader.
@@ -2439,43 +2482,63 @@ static void a3xx_drawctxt_restore(struct adreno_device *adreno_dev,
 	 */
 
 	if (context->flags & CTXT_FLAGS_GMEM_RESTORE) {
-		adreno_ringbuffer_issuecmds(device, context,
+		kgsl_cffdump_syncmem(NULL,
+			&context->gpustate,
+			context->context_gmem_shadow.gmem_restore[1],
+			context->context_gmem_shadow.gmem_restore[2] << 2,
+			true);
+
+		ret = adreno_ringbuffer_issuecmds(device, context,
 					KGSL_CMD_FLAGS_PMODE,
 					    context->context_gmem_shadow.
 					    gmem_restore, 3);
+		if (ret)
+			return ret;
 		context->flags &= ~CTXT_FLAGS_GMEM_RESTORE;
 	}
 
 	if (!(context->flags & CTXT_FLAGS_PREAMBLE)) {
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE, context->reg_restore, 3);
+		if (ret)
+			return ret;
 
 		/* Fixup self modifying IBs for restore operations */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->restore_fixup, 3);
+		if (ret)
+			return ret;
 
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->constant_restore, 3);
+		if (ret)
+			return ret;
 
-		if (context->flags & CTXT_FLAGS_SHADER_RESTORE)
-			adreno_ringbuffer_issuecmds(device, context,
-				KGSL_CMD_FLAGS_NONE,
-				context->shader_restore, 3);
-
+		if (context->flags & CTXT_FLAGS_SHADER_RESTORE) {
+			ret = adreno_ringbuffer_issuecmds(device, context,
+				KGSL_CMD_FLAGS_NONE,
+				context->shader_restore, 3);
+			if (ret)
+				return ret;
+		}
+
 		/* Restore HLSQ_CONTROL_0 register */
-		adreno_ringbuffer_issuecmds(device, context,
+		ret = adreno_ringbuffer_issuecmds(device, context,
 			KGSL_CMD_FLAGS_NONE,
 			context->hlsqcontrol_restore, 3);
 	}
+
+	return ret;
 }
 
-static void a3xx_rb_init(struct adreno_device *adreno_dev,
+static int a3xx_rb_init(struct adreno_device *adreno_dev,
 			 struct adreno_ringbuffer *rb)
 {
 	unsigned int *cmds, cmds_gpu;
 	cmds = adreno_ringbuffer_allocspace(rb, NULL, 18);
+	if (cmds == NULL)
+		return -ENOMEM;
+
 	cmds_gpu = rb->buffer_desc.gpuaddr + sizeof(uint) * (rb->wptr - 18);
 
 	GSL_RB_WRITE(cmds, cmds_gpu, cp_type3_packet(CP_ME_INIT, 17));
@@ -2499,6 +2562,8 @@ static void a3xx_rb_init(struct adreno_device *adreno_dev,
 	GSL_RB_WRITE(cmds, cmds_gpu, 0x00000000);
 
 	adreno_ringbuffer_submit(rb);
+
+	return 0;
 }
 
 static void a3xx_err_callback(struct adreno_device *adreno_dev, int bit)
@@ -2525,6 +2590,9 @@ static void a3xx_err_callback(struct adreno_device *adreno_dev, int bit)
 
 		/* Clear the error */
 		adreno_regwrite(device, A3XX_RBBM_AHB_CMD, (1 << 3));
+
+		/* Trigger a fault in the interrupt handler */
+		adreno_dispatcher_irq_fault(device);
 		return;
 	}
 	case A3XX_INT_RBBM_REG_TIMEOUT:
@@ -2566,8 +2634,13 @@ static void a3xx_err_callback(struct adreno_device *adreno_dev, int bit)
 	case A3XX_INT_UCHE_OOB_ACCESS:
 		err = "UCHE:  Out of bounds access";
 		break;
+	default:
+		return;
 	}
 
+	/* Trigger a fault in the dispatcher - this will effect a restart */
+	adreno_dispatcher_irq_fault(device);
+
 	KGSL_DRV_CRIT(device, "%s\n", err);
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
 }
@@ -2576,11 +2649,276 @@ static void a3xx_cp_callback(struct adreno_device *adreno_dev, int irq)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 
-	/* Wake up everybody waiting for the interrupt */
-	wake_up_interruptible_all(&device->wait_queue);
-
-	/* Schedule work to free mem and issue ibs */
+	/* Schedule the event queue */
 	queue_work(device->work_queue, &device->ts_expired_ws);
+
+	adreno_dispatcher_schedule(device);
+}
+
+/**
+ * struct a3xx_perfcounter_register - Define a performance counter register
+ * @load_bit: the bit to set in RBBM_LOAD_CMD0/RBBM_LOAD_CMD1 to force the RBBM
+ * to load the reset value into the appropriate counter
+ * @select: The dword offset of the register to write the selected
+ * countable into
+ */
+
+struct a3xx_perfcounter_register {
+	unsigned int load_bit;
+	unsigned int select;
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_cp[] = {
+	{ 0, A3XX_CP_PERFCOUNTER_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_rbbm[] = {
+	{ 1, A3XX_RBBM_PERFCOUNTER0_SELECT },
+	{ 2, A3XX_RBBM_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_pc[] = {
+	{ 3, A3XX_PC_PERFCOUNTER0_SELECT },
+	{ 4, A3XX_PC_PERFCOUNTER1_SELECT },
+	{ 5, A3XX_PC_PERFCOUNTER2_SELECT },
+	{ 6, A3XX_PC_PERFCOUNTER3_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_vfd[] = {
+	{ 7, A3XX_VFD_PERFCOUNTER0_SELECT },
+	{ 8, A3XX_VFD_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_hlsq[] = {
+	{ 9, A3XX_HLSQ_PERFCOUNTER0_SELECT },
+	{ 10, A3XX_HLSQ_PERFCOUNTER1_SELECT },
+	{ 11, A3XX_HLSQ_PERFCOUNTER2_SELECT },
+	{ 12, A3XX_HLSQ_PERFCOUNTER3_SELECT },
+	{ 13, A3XX_HLSQ_PERFCOUNTER4_SELECT },
+	{ 14, A3XX_HLSQ_PERFCOUNTER5_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_vpc[] = {
+	{ 15, A3XX_VPC_PERFCOUNTER0_SELECT },
+	{ 16, A3XX_VPC_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_tse[] = {
+	{ 17, A3XX_GRAS_PERFCOUNTER0_SELECT },
+	{ 18, A3XX_GRAS_PERFCOUNTER1_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_ras[] = {
+	{ 19, A3XX_GRAS_PERFCOUNTER2_SELECT },
+	{ 20, A3XX_GRAS_PERFCOUNTER3_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_uche[] = {
+	{ 21, A3XX_UCHE_PERFCOUNTER0_SELECT },
+	{ 22, A3XX_UCHE_PERFCOUNTER1_SELECT },
+	{ 23, A3XX_UCHE_PERFCOUNTER2_SELECT },
+	{ 24, A3XX_UCHE_PERFCOUNTER3_SELECT },
+	{ 25, A3XX_UCHE_PERFCOUNTER4_SELECT },
+	{ 26, A3XX_UCHE_PERFCOUNTER5_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_tp[] = {
+	{ 27, A3XX_TP_PERFCOUNTER0_SELECT },
+	{ 28, A3XX_TP_PERFCOUNTER1_SELECT },
+	{ 29, A3XX_TP_PERFCOUNTER2_SELECT },
+	{ 30, A3XX_TP_PERFCOUNTER3_SELECT },
+	{ 31, A3XX_TP_PERFCOUNTER4_SELECT },
+	{ 32, A3XX_TP_PERFCOUNTER5_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_sp[] = {
+	{ 33, A3XX_SP_PERFCOUNTER0_SELECT },
+	{ 34, A3XX_SP_PERFCOUNTER1_SELECT },
+	{ 35, A3XX_SP_PERFCOUNTER2_SELECT },
+	{ 36, A3XX_SP_PERFCOUNTER3_SELECT },
+	{ 37, A3XX_SP_PERFCOUNTER4_SELECT },
+	{ 38, A3XX_SP_PERFCOUNTER5_SELECT },
+	{ 39, A3XX_SP_PERFCOUNTER6_SELECT },
+	{ 40, A3XX_SP_PERFCOUNTER7_SELECT },
+};
+
+static struct a3xx_perfcounter_register a3xx_perfcounter_reg_rb[] = {
+	{ 41, A3XX_RB_PERFCOUNTER0_SELECT },
+	{ 42, A3XX_RB_PERFCOUNTER1_SELECT },
+};
+
+#define REGCOUNTER_GROUP(_x) { (_x), ARRAY_SIZE((_x)) }
+
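+/*
+ * This list is indexed by performance counter group, so the order of the
+ * entries below must match the order of the KGSL_PERFCOUNTER_GROUP_* values
+ */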
+static struct {
+	struct a3xx_perfcounter_register *regs;
+	int count;
+} a3xx_perfcounter_reglist[] = {
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_cp),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_rbbm),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_pc),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_vfd),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_hlsq),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_vpc),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_tse),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_ras),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_uche),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_tp),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_sp),
+	REGCOUNTER_GROUP(a3xx_perfcounter_reg_rb),
+};
+
+static void a3xx_perfcounter_enable_pwr(struct kgsl_device *device,
+	unsigned int countable)
+{
+	unsigned int in, out;
+
+	adreno_regread(device, A3XX_RBBM_RBBM_CTL, &in);
+
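+	/*
+	 * Pulse the reset bit for the selected counter and then enable it;
+	 * both writes start from the original register value, so the reset
+	 * bit is cleared again by the second write
+	 */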
+	if (countable == 0)
+		out = in | RBBM_RBBM_CTL_RESET_PWR_CTR0;
+	else
+		out = in | RBBM_RBBM_CTL_RESET_PWR_CTR1;
+
+	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, out);
+
+	if (countable == 0)
+		out = in | RBBM_RBBM_CTL_ENABLE_PWR_CTR0;
+	else
+		out = in | RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
+
+	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, out);
+}
+
+static void a3xx_perfcounter_enable_vbif(struct kgsl_device *device,
+					 unsigned int counter,
+					 unsigned int countable)
+{
+	unsigned int in, out, bit, sel;
+
+	if (counter > 1 || countable > 0x7f)
+		return;
+
+	adreno_regread(device, A3XX_VBIF_PERF_CNT_EN, &in);
+	adreno_regread(device, A3XX_VBIF_PERF_CNT_SEL, &sel);
+
+	if (counter == 0) {
+		bit = VBIF_PERF_CNT_0;
+		sel = (sel & ~VBIF_PERF_CNT_0_SEL_MASK) | countable;
+	} else {
+		bit = VBIF_PERF_CNT_1;
+		sel = (sel & ~VBIF_PERF_CNT_1_SEL_MASK)
+			| (countable << VBIF_PERF_CNT_1_SEL);
+	}
+
+	out = in | bit;
+
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_SEL, sel);
+
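+	/* Pulse the clear bit to zero the counter before enabling it */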
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_CLR, bit);
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_CLR, 0);
+
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_EN, out);
+}
+
+static void a3xx_perfcounter_enable_vbif_pwr(struct kgsl_device *device,
+					     unsigned int countable)
+{
+	unsigned int in, out, bit;
+
+	adreno_regread(device, A3XX_VBIF_PERF_CNT_EN, &in);
+	if (countable == 0)
+		bit = VBIF_PERF_PWR_CNT_0;
+	else if (countable == 1)
+		bit = VBIF_PERF_PWR_CNT_1;
+	else
+		bit = VBIF_PERF_PWR_CNT_2;
+
+	out = in | bit;
+
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_CLR, bit);
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_CLR, 0);
+
+	adreno_regwrite(device, A3XX_VBIF_PERF_CNT_EN, out);
+}
+
+/*
+ * a3xx_perfcounter_enable - Configure a performance counter for a countable
+ * @adreno_dev: Adreno device to configure
+ * @group: Desired performance counter group
+ * @counter: Desired performance counter in the group
+ * @countable: Desired countable
+ *
+ * Physically set up a counter within a group with the desired countable
+ */
+
+static void a3xx_perfcounter_enable(struct adreno_device *adreno_dev,
+	unsigned int group, unsigned int counter, unsigned int countable)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	unsigned int val = 0;
+	struct a3xx_perfcounter_register *reg;
+
+	/*
+	 * The PWR, VBIF and VBIF_PWR groups are special cases that are not
+	 * in a3xx_perfcounter_reglist, so handle them before the bounds
+	 * checks against that list
+	 */
+	if (group == KGSL_PERFCOUNTER_GROUP_PWR)
+		return a3xx_perfcounter_enable_pwr(device, countable);
+	else if (group == KGSL_PERFCOUNTER_GROUP_VBIF)
+		return a3xx_perfcounter_enable_vbif(device, counter, countable);
+	else if (group == KGSL_PERFCOUNTER_GROUP_VBIF_PWR)
+		return a3xx_perfcounter_enable_vbif_pwr(device, countable);
+
+	if (group >= ARRAY_SIZE(a3xx_perfcounter_reglist))
+		return;
+
+	if (counter >= a3xx_perfcounter_reglist[group].count)
+		return;
+
+	reg = &(a3xx_perfcounter_reglist[group].regs[counter]);
+
+	/* Select the desired perfcounter */
+	adreno_regwrite(device, reg->select, countable);
+
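+	/*
+	 * Ask the RBBM to load the counter: load bits 0-31 are written to
+	 * RBBM_PERFCTR_LOAD_CMD0 and bits 32 and up to RBBM_PERFCTR_LOAD_CMD1
+	 */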
+	if (reg->load_bit < 32) {
+		val = 1 << reg->load_bit;
+		adreno_regwrite(device, A3XX_RBBM_PERFCTR_LOAD_CMD0, val);
+	} else {
+		val  = 1 << (reg->load_bit - 32);
+		adreno_regwrite(device, A3XX_RBBM_PERFCTR_LOAD_CMD1, val);
+	}
+}
+
+static uint64_t a3xx_perfcounter_read(struct adreno_device *adreno_dev,
+	unsigned int group, unsigned int counter,
+	unsigned int offset)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct a3xx_perfcounter_register *reg = NULL;
+	unsigned int lo = 0, hi = 0;
+	unsigned int val;
+
+	if (group >= ARRAY_SIZE(a3xx_perfcounter_reglist))
+		return 0;
+
+	if (counter >= a3xx_perfcounter_reglist[group].count)
+		return 0;
+
+	reg = &(a3xx_perfcounter_reglist[group].regs[counter]);
+
+	/* Freeze the counter */
+	adreno_regread(device, A3XX_RBBM_PERFCTR_CTL, &val);
+	val &= ~reg->load_bit;
+	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, val);
+
+	/* Read the values */
+	adreno_regread(device, offset, &lo);
+	adreno_regread(device, offset + 1, &hi);
+
+	/* Re-Enable the counter */
+	val |= reg->load_bit;
+	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, val);
+
+	return (((uint64_t) hi) << 32) | lo;
 }
 
 #define A3XX_IRQ_CALLBACK(_c) { .func = _c }
@@ -2684,26 +3022,22 @@ static unsigned int a3xx_irq_pending(struct adreno_device *adreno_dev)
 static unsigned int a3xx_busy_cycles(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
-	unsigned int reg, val;
-
-	/* Freeze the counter */
-	adreno_regread(device, A3XX_RBBM_RBBM_CTL, &reg);
-	reg &= ~RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
-	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
+	unsigned int val;
+	unsigned int ret = 0;
 
 	/* Read the value */
 	adreno_regread(device, A3XX_RBBM_PERFCTR_PWR_1_LO, &val);
 
-	/* Reset the counter */
-	reg |= RBBM_RBBM_CTL_RESET_PWR_CTR1;
-	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
-
-	/* Re-enable the counter */
-	reg &= ~RBBM_RBBM_CTL_RESET_PWR_CTR1;
-	reg |= RBBM_RBBM_CTL_ENABLE_PWR_CTR1;
-	adreno_regwrite(device, A3XX_RBBM_RBBM_CTL, reg);
+	/*
+	 * The busy counter runs freely, so return the delta since the last
+	 * read while accounting for 32 bit wraparound, and return 0 for the
+	 * first read
+	 */
+	if (adreno_dev->gpu_cycles != 0) {
+		if (val < adreno_dev->gpu_cycles)
+			ret = (0xFFFFFFFF - adreno_dev->gpu_cycles) + val;
+		else
+			ret = val - adreno_dev->gpu_cycles;
+	}
 
-	return val;
+	adreno_dev->gpu_cycles = val;
+	return ret;
 }
 
 struct a3xx_vbif_data {
@@ -2781,17 +3115,83 @@ static struct a3xx_vbif_data a330_vbif[] = {
 	{0, 0},
 };
 
+/*
+ * Most of the VBIF registers on 8974v2 have the correct values at power on, so
+ * we won't modify those if we don't need to
+ */
+static struct a3xx_vbif_data a330v2_vbif[] = {
+	/* Enable 1k sort */
+	{ A3XX_VBIF_ABIT_SORT, 0x0001003F },
+	{ A3XX_VBIF_ABIT_SORT_CONF, 0x000000A4 },
+	/* Enable WR-REQ */
+	{ A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x00003F },
+	{ A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303 },
+	/* Set up VBIF_ROUND_ROBIN_QOS_ARB */
+	{ A3XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x0003 },
+	/*
+	 * Disable VBIF clock gating. This is to allow AXI to run at a
+	 * higher frequency than the GPU.
+	 */
+	{ A3XX_VBIF_CLKON, 1 },
+	{0, 0},
+};
+
+static struct {
+	int (*devfunc)(struct adreno_device *);
+	struct a3xx_vbif_data *vbif;
+} a3xx_vbif_platforms[] = {
+	{ adreno_is_a305, a305_vbif },
+	{ adreno_is_a320, a320_vbif },
+	/* A330v2 needs to be ahead of A330 so the right device matches */
+	{ adreno_is_a330v2, a330v2_vbif },
+	{ adreno_is_a330, a330_vbif },
+};
+
+static void a3xx_perfcounter_init(struct adreno_device *adreno_dev)
+{
+	/*
+	 * Set SP to count SP_ALU_ACTIVE_CYCLES; it includes
+	 * all ALU instruction execution regardless of precision or shader ID.
+	 * Set SP to count SP0_ICL1_MISSES; it counts
+	 * USP L1 instruction miss requests.
+	 * Set SP to count SP_FS_CFLOW_INSTRUCTIONS; it
+	 * counts USP flow control instruction execution.
+	 * We will use these to augment our hang detection.
+	 */
+	if (adreno_dev->fast_hang_detect) {
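+		/*
+		 * adreno_perfcounter_get() returns the LO register offset of
+		 * the assigned counter; the HI half of the 64 bit counter is
+		 * at the next dword offset, hence the +1 entries below
+		 */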
+		adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+			SP_ALU_ACTIVE_CYCLES, &ft_detect_regs[6],
+			PERFCOUNTER_FLAG_KERNEL);
+		ft_detect_regs[7] = ft_detect_regs[6] + 1;
+		adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+			SP0_ICL1_MISSES, &ft_detect_regs[8],
+			PERFCOUNTER_FLAG_KERNEL);
+		ft_detect_regs[9] = ft_detect_regs[8] + 1;
+		adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+			SP_FS_CFLOW_INSTRUCTIONS, &ft_detect_regs[10],
+			PERFCOUNTER_FLAG_KERNEL);
+		ft_detect_regs[11] = ft_detect_regs[10] + 1;
+	}
+
+	adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_SP,
+		SP_FS_FULL_ALU_INSTRUCTIONS, NULL, PERFCOUNTER_FLAG_KERNEL);
+
+	/* Reserve and start countable 1 in the PWR perfcounter group */
+	adreno_perfcounter_get(adreno_dev, KGSL_PERFCOUNTER_GROUP_PWR, 1,
+			NULL, PERFCOUNTER_FLAG_KERNEL);
+}
+
 static void a3xx_start(struct adreno_device *adreno_dev)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
 	struct a3xx_vbif_data *vbif = NULL;
+	int i;
 
-	if (adreno_is_a305(adreno_dev))
-		vbif = a305_vbif;
-	else if (adreno_is_a320(adreno_dev))
-		vbif = a320_vbif;
-	else if (adreno_is_a330(adreno_dev))
-		vbif = a330_vbif;
+	for (i = 0; i < ARRAY_SIZE(a3xx_vbif_platforms); i++) {
+		if (a3xx_vbif_platforms[i].devfunc(adreno_dev)) {
+			vbif = a3xx_vbif_platforms[i].vbif;
+			break;
+		}
+	}
 
 	BUG_ON(vbif == NULL);
 
@@ -2829,7 +3229,14 @@ static void a3xx_start(struct adreno_device *adreno_dev)
 
 	/* Enable Clock gating */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL,
-			A3XX_RBBM_CLOCK_CTL_DEFAULT);
+		adreno_a3xx_rbbm_clock_ctl_default(adreno_dev));
+
+	if (adreno_is_a330v2(adreno_dev))
+		adreno_regwrite(device, A3XX_RBBM_GPR0_CTL,
+			A330v2_RBBM_GPR0_CTL_DEFAULT);
+	else if (adreno_is_a330(adreno_dev))
+		adreno_regwrite(device, A3XX_RBBM_GPR0_CTL,
+			A330_RBBM_GPR0_CTL_DEFAULT);
 
 	/* Set the OCMEM base address for A330 */
 	if (adreno_is_a330(adreno_dev)) {
@@ -2840,25 +3247,133 @@ static void a3xx_start(struct adreno_device *adreno_dev)
 	/* Turn on performance counters */
 	adreno_regwrite(device, A3XX_RBBM_PERFCTR_CTL, 0x01);
 
-	/*
-	 * Set SP perfcounter 5 to count SP_ALU_ACTIVE_CYCLES, it includes
-	 * all ALU instruction execution regardless precision or shader ID.
-	 * Set SP perfcounter 6 to count SP0_ICL1_MISSES, It counts
-	 * USP L1 instruction miss request.
-	 * Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS, it
-	 * counts USP flow control instruction execution.
-	 * we will use this to augment our hang detection
-	 */
-	if (adreno_dev->fast_hang_detect) {
-		adreno_regwrite(device, A3XX_SP_PERFCOUNTER5_SELECT,
-			SP_ALU_ACTIVE_CYCLES);
-		adreno_regwrite(device, A3XX_SP_PERFCOUNTER6_SELECT,
-			SP0_ICL1_MISSES);
-		adreno_regwrite(device, A3XX_SP_PERFCOUNTER7_SELECT,
-			SP_FS_CFLOW_INSTRUCTIONS);
-	}
+	/* Turn on the GPU busy counter and let it run free */
+
+	adreno_dev->gpu_cycles = 0;
 }
 
+/*
+ * Define the available perfcounter groups - these get used by
+ * adreno_perfcounter_get and adreno_perfcounter_put
+ */
+
+static struct adreno_perfcount_register a3xx_perfcounters_cp[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_CP_0_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_rbbm[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RBBM_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RBBM_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_pc[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PC_3_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_vfd[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VFD_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VFD_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_hlsq[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_HLSQ_5_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_vpc[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VPC_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_VPC_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_tse[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TSE_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TSE_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_ras[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RAS_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RAS_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_uche[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_UCHE_5_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_tp[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_TP_5_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_sp[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_2_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_3_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_4_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_5_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_6_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_SP_7_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_rb[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RB_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_RB_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_pwr[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PWR_0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_RBBM_PERFCTR_PWR_1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_vbif[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_VBIF_PERF_CNT0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_VBIF_PERF_CNT1_LO, 0 },
+};
+
+static struct adreno_perfcount_register a3xx_perfcounters_vbif_pwr[] = {
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_VBIF_PERF_PWR_CNT0_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_VBIF_PERF_PWR_CNT1_LO, 0 },
+	{ KGSL_PERFCOUNTER_NOT_USED, 0, A3XX_VBIF_PERF_PWR_CNT2_LO, 0 },
+};
+
+static struct adreno_perfcount_group a3xx_perfcounter_groups[] = {
+	{ a3xx_perfcounters_cp, ARRAY_SIZE(a3xx_perfcounters_cp) },
+	{ a3xx_perfcounters_rbbm, ARRAY_SIZE(a3xx_perfcounters_rbbm) },
+	{ a3xx_perfcounters_pc, ARRAY_SIZE(a3xx_perfcounters_pc) },
+	{ a3xx_perfcounters_vfd, ARRAY_SIZE(a3xx_perfcounters_vfd) },
+	{ a3xx_perfcounters_hlsq, ARRAY_SIZE(a3xx_perfcounters_hlsq) },
+	{ a3xx_perfcounters_vpc, ARRAY_SIZE(a3xx_perfcounters_vpc) },
+	{ a3xx_perfcounters_tse, ARRAY_SIZE(a3xx_perfcounters_tse) },
+	{ a3xx_perfcounters_ras, ARRAY_SIZE(a3xx_perfcounters_ras) },
+	{ a3xx_perfcounters_uche, ARRAY_SIZE(a3xx_perfcounters_uche) },
+	{ a3xx_perfcounters_tp, ARRAY_SIZE(a3xx_perfcounters_tp) },
+	{ a3xx_perfcounters_sp, ARRAY_SIZE(a3xx_perfcounters_sp) },
+	{ a3xx_perfcounters_rb, ARRAY_SIZE(a3xx_perfcounters_rb) },
+	{ a3xx_perfcounters_pwr, ARRAY_SIZE(a3xx_perfcounters_pwr) },
+	{ a3xx_perfcounters_vbif, ARRAY_SIZE(a3xx_perfcounters_vbif) },
+	{ a3xx_perfcounters_vbif_pwr, ARRAY_SIZE(a3xx_perfcounters_vbif_pwr) },
+};
+
+static struct adreno_perfcounters a3xx_perfcounters = {
+	a3xx_perfcounter_groups,
+	ARRAY_SIZE(a3xx_perfcounter_groups),
+};
+
 /* Defined in adreno_a3xx_snapshot.c */
 void *a3xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 	int *remain, int hang);
@@ -2867,16 +3382,20 @@ struct adreno_gpudev adreno_a3xx_gpudev = {
 	.reg_rbbm_status = A3XX_RBBM_STATUS,
 	.reg_cp_pfp_ucode_addr = A3XX_CP_PFP_UCODE_ADDR,
 	.reg_cp_pfp_ucode_data = A3XX_CP_PFP_UCODE_DATA,
+	.perfcounters = &a3xx_perfcounters,
 
 	.ctxt_create = a3xx_drawctxt_create,
 	.ctxt_save = a3xx_drawctxt_save,
 	.ctxt_restore = a3xx_drawctxt_restore,
 	.ctxt_draw_workaround = NULL,
 	.rb_init = a3xx_rb_init,
+	.perfcounter_init = a3xx_perfcounter_init,
 	.irq_control = a3xx_irq_control,
 	.irq_handler = a3xx_irq_handler,
 	.irq_pending = a3xx_irq_pending,
 	.busy_cycles = a3xx_busy_cycles,
 	.start = a3xx_start,
 	.snapshot = a3xx_snapshot,
+	.perfcounter_enable = a3xx_perfcounter_enable,
+	.perfcounter_read = a3xx_perfcounter_read,
 };
diff --git a/drivers/gpu/msm/adreno_a3xx_snapshot.c b/drivers/gpu/msm/adreno_a3xx_snapshot.c
index d9d5ec8286d8c1dbd65fc58427e5ee7eb6bc7626..34cac7944479aad4c2fdd9fa928e15fccb52c754 100644
--- a/drivers/gpu/msm/adreno_a3xx_snapshot.c
+++ b/drivers/gpu/msm/adreno_a3xx_snapshot.c
@@ -21,6 +21,22 @@
 
 #define SHADER_MEMORY_SIZE 0x4000
 
+/**
+ * _rbbm_debug_bus_read - Helper function to read data from the RBBM
+ * debug bus.
+ * @device: GPU device to read/write registers
+ * @block_id: Debug bus block to read from
+ * @index: Index in the debug bus block to read
+ * @val: Pointer to store the value that was read
+ */
+static void _rbbm_debug_bus_read(struct kgsl_device *device,
+	unsigned int block_id, unsigned int index, unsigned int *val)
+{
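+	/*
+	 * Build the debug bus select value: the block id goes in bits
+	 * [15:8], bit 16 enables the bus, and the index is OR'ed into the
+	 * low bits at write time
+	 */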
+	unsigned int block = (block_id << 8) | (1 << 16);
+	adreno_regwrite(device, A3XX_RBBM_DEBUG_BUS_CTL, block | index);
+	adreno_regread(device, A3XX_RBBM_DEBUG_BUS_DATA_STATUS, val);
+}
+
 static int a3xx_snapshot_shader_memory(struct kgsl_device *device,
 	void *snapshot, int remain, void *priv)
 {
@@ -243,11 +259,8 @@ static int a3xx_snapshot_debugbus_block(struct kgsl_device *device,
 	header->id = id;
 	header->count = DEBUGFS_BLOCK_SIZE;
 
-	for (i = 0; i < DEBUGFS_BLOCK_SIZE; i++) {
-		adreno_regwrite(device, A3XX_RBBM_DEBUG_BUS_CTL, val | i);
-		adreno_regread(device, A3XX_RBBM_DEBUG_BUS_DATA_STATUS,
-			&data[i]);
-	}
+	for (i = 0; i < DEBUGFS_BLOCK_SIZE; i++)
+		_rbbm_debug_bus_read(device, id, i, &data[i]);
 
 	return size;
 }
@@ -309,18 +322,58 @@ static void _snapshot_hlsq_regs(struct kgsl_snapshot_registers *regs,
 	struct kgsl_snapshot_registers_list *list,
 	struct adreno_device *adreno_dev)
 {
-	/* HLSQ specific registers */
+	struct kgsl_device *device = &adreno_dev->dev;
+
 	/*
-	 * Don't dump any a3xx HLSQ registers just yet.  Reading the HLSQ
-	 * registers can cause the device to hang if the HLSQ block is
-	 * busy.  Add specific checks for each a3xx core as the requirements
-	 * are discovered.  Disable by default for now.
+	 * Trying to read HLSQ registers when the HLSQ block is busy
+	 * will cause the device to hang.  The RBBM_DEBUG_BUS has information
+	 * that will tell us if the HLSQ block is busy or not.  Read values
+	 * from the debug bus to ensure the HLSQ block is not busy (this
+	 * is hardware dependent).  If the HLSQ block is busy do not
+	 * dump the registers, otherwise dump the HLSQ registers.
 	 */
-	if (!adreno_is_a3xx(adreno_dev)) {
-		regs[list->count].regs = (unsigned int *) a3xx_hlsq_registers;
-		regs[list->count].count = a3xx_hlsq_registers_count;
-		list->count++;
+
+	if (adreno_is_a330(adreno_dev)) {
+		/*
+		 * stall_ctxt_full status bit: RBBM_BLOCK_ID_HLSQ index 49 [27]
+		 *
+		 * if (!stall_context_full)
+		 * then dump HLSQ registers
+		 */
+		unsigned int stall_context_full = 0;
+
+		_rbbm_debug_bus_read(device, RBBM_BLOCK_ID_HLSQ, 49,
+				&stall_context_full);
+		stall_context_full &= 0x08000000;
+
+		if (stall_context_full)
+			return;
+	} else {
+		/*
+		 * tpif status bits: RBBM_BLOCK_ID_HLSQ index 4 [4:0]
+		 * spif status bits: RBBM_BLOCK_ID_HLSQ index 7 [5:0]
+		 *
+		 * if ((tpif == 0, 1, 28) && (spif == 0, 1, 10))
+		 * then dump HLSQ registers
+		 */
+		unsigned int next_pif = 0;
+
+		/* check tpif */
+		_rbbm_debug_bus_read(device, RBBM_BLOCK_ID_HLSQ, 4, &next_pif);
+		next_pif &= 0x1f;
+		if (next_pif != 0 && next_pif != 1 && next_pif != 28)
+			return;
+
+		/* check spif */
+		_rbbm_debug_bus_read(device, RBBM_BLOCK_ID_HLSQ, 7, &next_pif);
+		next_pif &= 0x3f;
+		if (next_pif != 0 && next_pif != 1 && next_pif != 10)
+			return;
 	}
+
+	regs[list->count].regs = (unsigned int *) a3xx_hlsq_registers;
+	regs[list->count].count = a3xx_hlsq_registers_count;
+	list->count++;
 }
 
 static void _snapshot_a330_regs(struct kgsl_snapshot_registers *regs,
@@ -414,7 +467,7 @@ void *a3xx_snapshot(struct adreno_device *adreno_dev, void *snapshot,
 
 	/* Enable Clock gating */
 	adreno_regwrite(device, A3XX_RBBM_CLOCK_CTL,
-			A3XX_RBBM_CLOCK_CTL_DEFAULT);
+		adreno_a3xx_rbbm_clock_ctl_default(adreno_dev));
 
 	return snapshot;
 }
diff --git a/drivers/gpu/msm/adreno_debugfs.c b/drivers/gpu/msm/adreno_debugfs.c
index 890c8a1805919018a3c8097c07c3dfb21785f86f..e6e4d769d7962bcfdb2fa2de5dd3c8b7b7abf496 100644
--- a/drivers/gpu/msm/adreno_debugfs.c
+++ b/drivers/gpu/msm/adreno_debugfs.c
@@ -43,6 +43,17 @@ static int kgsl_cff_dump_enable_get(void *data, u64 *val)
 DEFINE_SIMPLE_ATTRIBUTE(kgsl_cff_dump_enable_fops, kgsl_cff_dump_enable_get,
 			kgsl_cff_dump_enable_set, "%llu\n");
 
+static int _active_count_get(void *data, u64 *val)
+{
+	struct kgsl_device *device = data;
+	unsigned int i = atomic_read(&device->active_cnt);
+
+	*val = (u64) i;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(_active_count_fops, _active_count_get, NULL, "%llu\n");
+
 typedef void (*reg_read_init_t)(struct kgsl_device *device);
 typedef void (*reg_read_fill_t)(struct kgsl_device *device, int i,
 	unsigned int *vals, int linec);
@@ -64,23 +75,18 @@ void adreno_debugfs_init(struct kgsl_device *device)
 	adreno_dev->fast_hang_detect = 1;
 	debugfs_create_u32("fast_hang_detect", 0644, device->d_debugfs,
 			   &adreno_dev->fast_hang_detect);
-
-	/* Top level switch to enable/disable userspace FT control */
-	adreno_dev->ft_user_control = 0;
-	debugfs_create_u32("ft_user_control", 0644, device->d_debugfs,
-			   &adreno_dev->ft_user_control);
 	/*
 	 * FT policy can be set to any of the options below.
-	 * KGSL_FT_DISABLE -> BIT(0) Set to disable FT
+	 * KGSL_FT_OFF -> BIT(0) Set to turn off FT
 	 * KGSL_FT_REPLAY  -> BIT(1) Set to enable replay
 	 * KGSL_FT_SKIPIB  -> BIT(2) Set to skip IB
 	 * KGSL_FT_SKIPFRAME -> BIT(3) Set to skip frame
+	 * KGSL_FT_DISABLE -> BIT(4) Set to disable FT for faulting context
 	 * by default set FT policy to KGSL_FT_DEFAULT_POLICY
 	 */
 	adreno_dev->ft_policy = KGSL_FT_DEFAULT_POLICY;
 	debugfs_create_u32("ft_policy", 0644, device->d_debugfs,
 			   &adreno_dev->ft_policy);
-
 	/* By default enable long IB detection */
 	adreno_dev->long_ib_detect = 1;
 	debugfs_create_u32("long_ib_detect", 0644, device->d_debugfs,
@@ -96,7 +102,10 @@ void adreno_debugfs_init(struct kgsl_device *device)
 	 * KGSL_FT_PAGEFAULT_LOG_ONE_PER_INT -> BIT(3) Set to log only one
 	 * pagefault per INT.
 	 */
 	adreno_dev->ft_pf_policy = KGSL_FT_PAGEFAULT_DEFAULT_POLICY;
 	debugfs_create_u32("ft_pagefault_policy", 0644, device->d_debugfs,
 			   &adreno_dev->ft_pf_policy);
+
+	debugfs_create_file("active_cnt", 0644, device->d_debugfs, device,
+			    &_active_count_fops);
 }
diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c
new file mode 100644
index 0000000000000000000000000000000000000000..72b73b6fa784235716184a3b34e82edcf4537325
--- /dev/null
+++ b/drivers/gpu/msm/adreno_dispatch.c
@@ -0,0 +1,1415 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/wait.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/err.h>
+
+#include "kgsl.h"
+#include "adreno.h"
+#include "adreno_ringbuffer.h"
+#include "adreno_trace.h"
+
+#define ADRENO_DISPATCHER_ACTIVE 0
+#define ADRENO_DISPATCHER_PAUSE 1
+
+#define ADRENO_DISPATCHER_SOFT_FAULT 1
+#define ADRENO_DISPATCHER_HARD_FAULT 2
+#define ADRENO_DISPATCHER_TIMEOUT_FAULT 3
+
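+/* Advance a circular queue index, wrapping back to zero at the queue size */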
+#define CMDQUEUE_NEXT(_i, _s) (((_i) + 1) % (_s))
+
+/* Number of commands that can be queued in a context before it sleeps */
+static unsigned int _context_cmdqueue_size = 50;
+
+/* Number of milliseconds to wait for the context queue to clear */
+static unsigned int _context_queue_wait = 10000;
+
+/* Number of command batches sent at a time from a single context */
+static unsigned int _context_cmdbatch_burst = 5;
+
+/* Maximum number of command batches in flight in the ringbuffer at a time */
+static unsigned int _dispatcher_inflight = 15;
+
+/* Command batch timeout (in milliseconds) */
+static unsigned int _cmdbatch_timeout = 2000;
+
+/* Interval for reading and comparing fault detection registers */
+static unsigned int _fault_timer_interval = 100;
+
+/* Local array for the current set of fault detect registers */
+static unsigned int *fault_detect_regs;
+
+/**
+ * fault_detect_read() - Read the set of fault detect registers
+ * @device: Pointer to the KGSL device struct
+ *
+ * Read the set of fault detect registers and store them in the local array.
+ * This is for the initial values that are compared later with
+ * fault_detect_read_compare
+ */
+static void fault_detect_read(struct kgsl_device *device)
+{
+	int i;
+
+	for (i = 0; i < FT_DETECT_REGS_COUNT; i++) {
+		if (ft_detect_regs[i] == 0)
+			continue;
+		adreno_regread(device, ft_detect_regs[i],
+			&fault_detect_regs[i]);
+	}
+}
+
+/**
+ * fault_detect_read_compare() - Read the fault detect registers and compare
+ * them to the current value
+ * @device: Pointer to the KGSL device struct
+ *
+ * Read the set of fault detect registers and compare them to the current set
+ * of registers.  Return 1 if any of the register values changed
+ */
+static int fault_detect_read_compare(struct kgsl_device *device)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < FT_DETECT_REGS_COUNT; i++) {
+		unsigned int val;
+
+		if (ft_detect_regs[i] == 0)
+			continue;
+		adreno_regread(device, ft_detect_regs[i], &val);
+		if (val != fault_detect_regs[i])
+			ret = 1;
+		fault_detect_regs[i] = val;
+	}
+
+	return ret;
+}
+
+/**
+ * adreno_context_get_cmdbatch() - Get a new command from a context queue
+ * @drawctxt: Pointer to the adreno draw context
+ *
+ * Dequeue a new command batch from the context list
+ */
+static inline struct kgsl_cmdbatch *adreno_context_get_cmdbatch(
+		struct adreno_context *drawctxt)
+{
+	struct kgsl_cmdbatch *cmdbatch = NULL;
+
+	mutex_lock(&drawctxt->mutex);
+	if (drawctxt->cmdqueue_head != drawctxt->cmdqueue_tail) {
+		cmdbatch = drawctxt->cmdqueue[drawctxt->cmdqueue_head];
+
+		/*
+		 * Don't dequeue a cmdbatch that is still waiting for other
+		 * events
+		 */
+		if (kgsl_cmdbatch_sync_pending(cmdbatch)) {
+			cmdbatch = ERR_PTR(-EAGAIN);
+			goto done;
+		}
+
+		drawctxt->cmdqueue_head =
+			CMDQUEUE_NEXT(drawctxt->cmdqueue_head,
+			ADRENO_CONTEXT_CMDQUEUE_SIZE);
+		drawctxt->queued--;
+	}
+
+done:
+	mutex_unlock(&drawctxt->mutex);
+
+	return cmdbatch;
+}
+
+/**
+ * adreno_context_requeue_cmdbatch() - Put a command back on the context queue
+ * @drawctxt: Pointer to the adreno draw context
+ * @cmdbatch: Pointer to the KGSL cmdbatch to requeue
+ *
+ * Failure to submit a command to the ringbuffer isn't the fault of the command
+ * being submitted so if a failure happens, push it back on the head of the
+ * context queue to be reconsidered again
+ */
+static inline void adreno_context_requeue_cmdbatch(
+		struct adreno_context *drawctxt, struct kgsl_cmdbatch *cmdbatch)
+{
+	int prev;
+	mutex_lock(&drawctxt->mutex);
+
+	prev = drawctxt->cmdqueue_head - 1;
+
+	if (prev < 0)
+		prev = ADRENO_CONTEXT_CMDQUEUE_SIZE - 1;
+
+	/*
+	 * The maximum queue size always needs to be one less than the size of
+	 * the ringbuffer queue so there is "room" to put the cmdbatch back in
+	 */
+
+	BUG_ON(prev == drawctxt->cmdqueue_tail);
+
+	drawctxt->cmdqueue[prev] = cmdbatch;
+	drawctxt->queued++;
+
+	/* Reset the command queue head to reflect the newly requeued change */
+	drawctxt->cmdqueue_head = prev;
+	mutex_unlock(&drawctxt->mutex);
+}
+
+/**
+ * dispatcher_queue_context() - Queue a context in the dispatcher pending list
+ * @adreno_dev: Pointer to the adreno device struct
+ * @drawctxt: Pointer to the adreno draw context
+ *
+ * Add a context to the dispatcher pending list.
+ */
+static void dispatcher_queue_context(struct adreno_device *adreno_dev,
+		struct adreno_context *drawctxt)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	spin_lock(&dispatcher->plist_lock);
+
+	if (plist_node_empty(&drawctxt->pending)) {
+		/* Get a reference to the context while it sits on the list */
+		_kgsl_context_get(&drawctxt->base);
+		trace_dispatch_queue_context(drawctxt);
+		plist_add(&drawctxt->pending, &dispatcher->pending);
+	}
+
+	spin_unlock(&dispatcher->plist_lock);
+}
+
+/**
+ * sendcmd() - Send a command batch to the GPU hardware
+ * @adreno_dev: Pointer to the adreno device struct
+ * @cmdbatch: Pointer to the KGSL cmdbatch being sent
+ *
+ * Send a KGSL command batch to the GPU hardware
+ */
+static int sendcmd(struct adreno_device *adreno_dev,
+	struct kgsl_cmdbatch *cmdbatch)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+	int ret;
+
+	dispatcher->inflight++;
+
+	mutex_lock(&device->mutex);
+
+	if (dispatcher->inflight == 1) {
+		/* Time to make the donuts.  Turn on the GPU */
+		ret = kgsl_active_count_get(device);
+		if (ret) {
+			dispatcher->inflight--;
+			mutex_unlock(&device->mutex);
+			return ret;
+		}
+	}
+
+	ret = adreno_ringbuffer_submitcmd(adreno_dev, cmdbatch);
+
+	/*
+	 * On the first command, if the submission was successful, read the
+	 * initial values of the fault detect registers.  If it failed then
+	 * turn off the GPU. Sad face.
+	 */
+
+	if (dispatcher->inflight == 1) {
+		if (ret == 0)
+			fault_detect_read(device);
+		else
+			kgsl_active_count_put(device);
+	}
+
+	mutex_unlock(&device->mutex);
+
+	if (ret) {
+		dispatcher->inflight--;
+		KGSL_DRV_ERR(device,
+			"Unable to submit command to the ringbuffer\n");
+		return ret;
+	}
+
+	trace_adreno_cmdbatch_submitted(cmdbatch, dispatcher->inflight);
+
+	dispatcher->cmdqueue[dispatcher->tail] = cmdbatch;
+	dispatcher->tail = (dispatcher->tail + 1) %
+		ADRENO_DISPATCH_CMDQUEUE_SIZE;
+
+	/*
+	 * If this is the first command in the pipe then the GPU will
+	 * immediately start executing it so we can start the expiry timeout on
+	 * the command batch here.  Subsequent command batches will have their
+	 * timer started when the previous command batch is retired
+	 */
+	if (dispatcher->inflight == 1) {
+		cmdbatch->expires = jiffies +
+			msecs_to_jiffies(_cmdbatch_timeout);
+		mod_timer(&dispatcher->timer, cmdbatch->expires);
+
+		/* Start the fault detection timer */
+		if (adreno_dev->fast_hang_detect)
+			mod_timer(&dispatcher->fault_timer,
+				jiffies +
+				msecs_to_jiffies(_fault_timer_interval));
+	}
+
+	return 0;
+}
+
+/**
+ * dispatcher_context_sendcmds() - Send commands from a context to the GPU
+ * @adreno_dev: Pointer to the adreno device struct
+ * @drawctxt: Pointer to the adreno context to dispatch commands from
+ *
+ * Dequeue and send a burst of commands from the specified context to the GPU
+ */
+static int dispatcher_context_sendcmds(struct adreno_device *adreno_dev,
+		struct adreno_context *drawctxt)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+	int count = 0;
+
+	/*
+	 * Each context can send a specific number of command batches per cycle
+	 */
+	for ( ; count < _context_cmdbatch_burst &&
+		dispatcher->inflight < _dispatcher_inflight; count++) {
+		int ret;
+		struct kgsl_cmdbatch *cmdbatch =
+			adreno_context_get_cmdbatch(drawctxt);
+
+		if (cmdbatch == NULL)
+			break;
+
+		/*
+		 * adreno_context_get_cmdbatch returns -EAGAIN if the current
+		 * cmdbatch has pending sync points so no more to do here.
+		 * When the sync points are satisfied then the context will get
+		 * requeued
+		 */
+
+		if (IS_ERR(cmdbatch))
+			return count;
+
+		/*
+		 * If this is a synchronization submission then there are no
+		 * commands to submit.  Discard it and get the next item from
+		 * the queue.  Decrement count so this packet doesn't count
+		 * against the burst for the context
+		 */
+
+		if (cmdbatch->flags & KGSL_CONTEXT_SYNC) {
+			count--;
+			kgsl_cmdbatch_destroy(cmdbatch);
+			continue;
+		}
+
+		ret = sendcmd(adreno_dev, cmdbatch);
+
+		/*
+		 * There are various reasons why we can't submit a command (no
+		 * memory for the commands, full ringbuffer, etc) but none of
+		 * these are actually the current command's fault.  Requeue it
+		 * back on the context and let it come back around again if
+		 * conditions improve
+		 */
+		if (ret) {
+			adreno_context_requeue_cmdbatch(drawctxt, cmdbatch);
+			break;
+		}
+	}
+
+	/*
+	 * If the context successfully submitted commands, then
+	 * unconditionally put it back on the queue to be considered the
+	 * next time around. This might seem a little wasteful but it is
+	 * reasonable to think that a busy context will stay busy.
+	 */
+
+	if (count) {
+		dispatcher_queue_context(adreno_dev, drawctxt);
+
+		/*
+		 * If we submitted something there will be room in the
+		 * context queue so ping the context wait queue on the
+		 * chance that the context is snoozing
+		 */
+
+		wake_up_interruptible_all(&drawctxt->wq);
+	}
+
+	return count;
+}
+
+/**
+ * _adreno_dispatcher_issuecmds() - Issue commands from pending contexts
+ * @adreno_dev: Pointer to the adreno device struct
+ *
+ * Issue as many commands as possible (up to inflight) from the pending contexts
+ * This function assumes the dispatcher mutex has been locked.
+ */
+static int _adreno_dispatcher_issuecmds(struct adreno_device *adreno_dev)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	/* Don't do anything if the dispatcher is paused */
+	if (dispatcher->state != ADRENO_DISPATCHER_ACTIVE)
+		return 0;
+
+	while (dispatcher->inflight < _dispatcher_inflight) {
+		struct adreno_context *drawctxt = NULL;
+
+		spin_lock(&dispatcher->plist_lock);
+
+		if (!plist_head_empty(&dispatcher->pending)) {
+			drawctxt = plist_first_entry(&dispatcher->pending,
+				struct adreno_context, pending);
+
+			plist_del(&drawctxt->pending, &dispatcher->pending);
+		}
+
+		spin_unlock(&dispatcher->plist_lock);
+
+		if (drawctxt == NULL)
+			break;
+
+		if (kgsl_context_detached(&drawctxt->base) ||
+			drawctxt->state == ADRENO_CONTEXT_STATE_INVALID) {
+			kgsl_context_put(&drawctxt->base);
+			continue;
+		}
+
+		dispatcher_context_sendcmds(adreno_dev, drawctxt);
+		kgsl_context_put(&drawctxt->base);
+	}
+
+	return 0;
+}
+
+/**
+ * adreno_dispatcher_issuecmds() - Issue commands from pending contexts
+ * @adreno_dev: Pointer to the adreno device struct
+ *
+ * Lock the dispatcher and call _adreno_dispatcher_issuecmds()
+ */
+int adreno_dispatcher_issuecmds(struct adreno_device *adreno_dev)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+	int ret;
+
+	mutex_lock(&dispatcher->mutex);
+	ret = _adreno_dispatcher_issuecmds(adreno_dev);
+	mutex_unlock(&dispatcher->mutex);
+
+	return ret;
+}
+
+static int _check_context_queue(struct adreno_context *drawctxt)
+{
+	int ret;
+
+	mutex_lock(&drawctxt->mutex);
+
+	/*
+	 * Wake up if there is room in the context queue or if the whole
+	 * thing got invalidated while we were asleep
+	 */
+
+	if (drawctxt->state == ADRENO_CONTEXT_STATE_INVALID)
+		ret = 1;
+	else
+		ret = drawctxt->queued < _context_cmdqueue_size ? 1 : 0;
+
+	mutex_unlock(&drawctxt->mutex);
+
+	return ret;
+}
+
+/**
+ * get_timestamp() - Return the next timestamp for the context
+ * @drawctxt: Pointer to an adreno draw context struct
+ * @cmdbatch: Pointer to a command batch
+ * @timestamp: Pointer to a timestamp value possibly passed from the user
+ *
+ * Assign a timestamp based on the settings of the draw context and the command
+ * batch.
+ */
+static int get_timestamp(struct adreno_context *drawctxt,
+		struct kgsl_cmdbatch *cmdbatch, unsigned int *timestamp)
+{
+	/* Synchronization commands don't get a timestamp */
+	if (cmdbatch->flags & KGSL_CONTEXT_SYNC) {
+		*timestamp = 0;
+		return 0;
+	}
+
+	if (drawctxt->flags & CTXT_FLAGS_USER_GENERATED_TS) {
+		/*
+		 * User specified timestamps need to be greater than the last
+		 * issued timestamp in the context
+		 */
+		if (timestamp_cmp(drawctxt->timestamp, *timestamp) >= 0)
+			return -ERANGE;
+
+		drawctxt->timestamp = *timestamp;
+	} else {
+		drawctxt->timestamp++;
+	}
+
+	*timestamp = drawctxt->timestamp;
+	return 0;
+}
+
+/**
+ * adreno_context_queue_cmd() - Queue a new command in the context
+ * @adreno_dev: Pointer to the adreno device struct
+ * @drawctxt: Pointer to the adreno draw context
+ * @cmdbatch: Pointer to the command batch being submitted
+ * @timestamp: Pointer to the requested timestamp
+ *
+ * Queue a command in the context - if there isn't any room in the queue, then
+ * block until there is
+ */
+int adreno_context_queue_cmd(struct adreno_device *adreno_dev,
+		struct adreno_context *drawctxt, struct kgsl_cmdbatch *cmdbatch,
+		uint32_t *timestamp)
+{
+	int ret;
+
+	mutex_lock(&drawctxt->mutex);
+
+	if (drawctxt->flags & CTXT_FLAGS_BEING_DESTROYED) {
+		mutex_unlock(&drawctxt->mutex);
+		return -EINVAL;
+	}
+
+	/*
+	 * After skipping to the end of the frame we need to force the preamble
+	 * to run (if it exists) regardless of the context state.
+	 */
+
+	if (drawctxt->flags & CTXT_FLAGS_FORCE_PREAMBLE) {
+		cmdbatch->priv |= CMDBATCH_FLAG_FORCE_PREAMBLE;
+		drawctxt->flags &= ~CTXT_FLAGS_FORCE_PREAMBLE;
+	}
+
+	/*
+	 * If we are waiting for the end of frame and it hasn't appeared yet,
+	 * then mark the command batch as skipped.  It will still progress
+	 * through the pipeline but it won't actually send any commands
+	 */
+
+	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
+		cmdbatch->priv |= CMDBATCH_FLAG_SKIP;
+
+		/*
+		 * If this command batch represents the EOF then clear the way
+		 * for the dispatcher to continue submitting
+		 */
+
+		if (cmdbatch->flags & KGSL_CONTEXT_END_OF_FRAME) {
+			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
+
+			/*
+			 * Force the preamble on the next command to ensure that
+			 * the state is correct
+			 */
+
+			drawctxt->flags |= CTXT_FLAGS_FORCE_PREAMBLE;
+		}
+	}
+
+	/* Wait for room in the context queue */
+
+	while (drawctxt->queued >= _context_cmdqueue_size) {
+		trace_adreno_context_sleep(drawctxt);
+		mutex_unlock(&drawctxt->mutex);
+
+		ret = wait_event_interruptible_timeout(drawctxt->wq,
+			_check_context_queue(drawctxt),
+			msecs_to_jiffies(_context_queue_wait));
+
+		mutex_lock(&drawctxt->mutex);
+		trace_adreno_context_wake(drawctxt);
+
+		if (ret <= 0) {
+			mutex_unlock(&drawctxt->mutex);
+			return (ret == 0) ? -ETIMEDOUT : (int) ret;
+		}
+
+		/*
+		 * Account for the possibility that the context got invalidated
+		 * while we were sleeping
+		 */
+
+		if (drawctxt->state == ADRENO_CONTEXT_STATE_INVALID) {
+			mutex_unlock(&drawctxt->mutex);
+			return -EDEADLK;
+		}
+	}
+
+	ret = get_timestamp(drawctxt, cmdbatch, timestamp);
+	if (ret) {
+		mutex_unlock(&drawctxt->mutex);
+		return ret;
+	}
+
+	cmdbatch->timestamp = *timestamp;
+
+	/* The batch fault policy is the current system fault policy */
+	cmdbatch->fault_policy = adreno_dev->ft_policy;
+
+	/* Put the command into the queue */
+	drawctxt->cmdqueue[drawctxt->cmdqueue_tail] = cmdbatch;
+	drawctxt->cmdqueue_tail = (drawctxt->cmdqueue_tail + 1) %
+		ADRENO_CONTEXT_CMDQUEUE_SIZE;
+
+	drawctxt->queued++;
+	trace_adreno_cmdbatch_queued(cmdbatch, drawctxt->queued);
+
+	mutex_unlock(&drawctxt->mutex);
+
+	/* Add the context to the dispatcher pending list */
+	dispatcher_queue_context(adreno_dev, drawctxt);
+
+	/*
+	 * Only issue commands if inflight is less than burst - this prevents
+	 * us from sitting around waiting for the mutex on a busy system - the
+	 * work loop will schedule it for us. Inflight is mutex protected, but
+	 * the worst that can happen is that it will go to 0 after we check;
+	 * if it goes to 0 it is because the work loop decremented it, and the
+	 * work queue will try to schedule new commands anyway.
+	 */
+
+	if (adreno_dev->dispatcher.inflight < _context_cmdbatch_burst)
+		adreno_dispatcher_issuecmds(adreno_dev);
+
+	return 0;
+}
+
+/*
+ * If an IB inside of the command batch has a gpuaddr that matches the base
+ * passed in then zero the size which effectively skips it when it is submitted
+ * in the ringbuffer.
+ */
+static void cmdbatch_skip_ib(struct kgsl_cmdbatch *cmdbatch, unsigned int base)
+{
+	int i;
+
+	for (i = 0; i < cmdbatch->ibcount; i++) {
+		if (cmdbatch->ibdesc[i].gpuaddr == base) {
+			cmdbatch->ibdesc[i].sizedwords = 0;
+			return;
+		}
+	}
+}
+
+static void cmdbatch_skip_frame(struct kgsl_cmdbatch *cmdbatch,
+	struct kgsl_cmdbatch **replay, int count)
+{
+	struct adreno_context *drawctxt = ADRENO_CONTEXT(cmdbatch->context);
+	int skip = 1;
+	int i;
+
+	for (i = 0; i < count; i++) {
+
+		/*
+		 * Only operate on command batches that belong to the
+		 * faulting context
+		 */
+
+		if (replay[i]->context->id != cmdbatch->context->id)
+			continue;
+
+		/*
+		 * Skip all the command batches in this context until
+		 * the EOF flag is seen.  If the EOF flag is seen then
+		 * force the preamble for the next command.
+		 */
+
+		if (skip) {
+			replay[i]->priv |= CMDBATCH_FLAG_SKIP;
+
+			if (replay[i]->flags & KGSL_CONTEXT_END_OF_FRAME)
+				skip = 0;
+		} else {
+			replay[i]->priv |= CMDBATCH_FLAG_FORCE_PREAMBLE;
+			return;
+		}
+	}
+
+	/*
+	 * If the EOF flag hasn't been seen yet then set the flag in the
+	 * drawctxt to keep looking for it
+	 */
+
+	if (skip && drawctxt)
+		drawctxt->flags |= CTXT_FLAGS_SKIP_EOF;
+
+	/*
+	 * If we did see the EOF flag then force the preamble on for the
+	 * next command issued on this context
+	 */
+
+	if (!skip && drawctxt)
+		drawctxt->flags |= CTXT_FLAGS_FORCE_PREAMBLE;
+}
+
+static void remove_invalidated_cmdbatches(struct kgsl_device *device,
+		struct kgsl_cmdbatch **replay, int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		struct kgsl_cmdbatch *cmd = replay[i];
+		struct adreno_context *drawctxt;
+
+		if (cmd == NULL)
+			continue;
+
+		drawctxt = ADRENO_CONTEXT(cmd->context);
+
+		if (kgsl_context_detached(cmd->context) ||
+			drawctxt->state == ADRENO_CONTEXT_STATE_INVALID) {
+			replay[i] = NULL;
+
+			mutex_lock(&device->mutex);
+			kgsl_cancel_events_timestamp(device, cmd->context,
+				cmd->timestamp);
+			mutex_unlock(&device->mutex);
+
+			kgsl_cmdbatch_destroy(cmd);
+		}
+	}
+}
+
+static void dispatcher_do_fault(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+	unsigned int ptr;
+	unsigned int reg, base;
+	struct kgsl_cmdbatch **replay = NULL;
+	struct kgsl_cmdbatch *cmdbatch;
+	int ret, i, count = 0;
+
+	BUG_ON(dispatcher->inflight == 0);
+
+	/* Turn off all the timers */
+	del_timer_sync(&dispatcher->timer);
+	del_timer_sync(&dispatcher->fault_timer);
+
+	mutex_lock(&device->mutex);
+
+	cmdbatch = dispatcher->cmdqueue[dispatcher->head];
+
+	/*
+	 * If the fault was due to a timeout then stop the CP to ensure we don't
+	 * get activity while we are trying to dump the state of the system
+	 */
+
+	if (dispatcher->fault == ADRENO_DISPATCHER_TIMEOUT_FAULT) {
+		kgsl_regread(device, REG_CP_ME_CNTL, &reg);
+		reg |= (1 << 27) | (1 << 28);
+		kgsl_regwrite(device, REG_CP_ME_CNTL, reg);
+
+		/* Skip the PM dump for a timeout because it confuses people */
+		cmdbatch->fault_policy |= KGSL_FT_SKIP_PMDUMP;
+	}
+
+	kgsl_regread(device, REG_CP_IB1_BASE, &base);
+
+	/*
+	 * Dump the postmortem and snapshot information if this is the first
+	 * detected fault for the oldest active command batch
+	 */
+
+	if (!(cmdbatch->fault_policy & KGSL_FT_SKIP_PMDUMP)) {
+		kgsl_postmortem_dump(device, 0);
+		kgsl_device_snapshot(device, 1);
+	}
+
+	mutex_unlock(&device->mutex);
+
+	/* Allocate memory to store the inflight commands */
+	replay = kzalloc(sizeof(*replay) * dispatcher->inflight, GFP_KERNEL);
+
+	if (replay == NULL) {
+		ptr = dispatcher->head;
+
+		while (ptr != dispatcher->tail) {
+			struct kgsl_context *context =
+				dispatcher->cmdqueue[ptr]->context;
+
+			adreno_drawctxt_invalidate(device, context);
+			kgsl_cmdbatch_destroy(dispatcher->cmdqueue[ptr]);
+
+			ptr = CMDQUEUE_NEXT(ptr, ADRENO_DISPATCH_CMDQUEUE_SIZE);
+		}
+
+		/*
+		 * Set the replay count to zero - this will ensure that the
+		 * hardware gets reset but nothing else gets replayed
+		 */
+
+		count = 0;
+		goto replay;
+	}
+
+	/* Copy the inflight command batches into the temporary storage */
+	ptr = dispatcher->head;
+
+	while (ptr != dispatcher->tail) {
+		replay[count++] = dispatcher->cmdqueue[ptr];
+		ptr = CMDQUEUE_NEXT(ptr, ADRENO_DISPATCH_CMDQUEUE_SIZE);
+	}
+
+	/*
+	 * For the purposes of replay, we assume that the oldest command batch
+	 * that hasn't retired a timestamp is "hung".
+	 */
+
+	cmdbatch = replay[0];
+
+	/*
+	 * Set a flag so we don't print another PM dump if the cmdbatch fails
+	 * again on replay
+	 */
+
+	cmdbatch->fault_policy |= KGSL_FT_SKIP_PMDUMP;
+
+	/*
+	 * A hardware fault generally means something was deterministically
+	 * wrong with the command batch - no point in trying to replay it.
+	 * Clear the replay bit and move on to the next policy level
+	 */
+
+	if (dispatcher->fault == ADRENO_DISPATCHER_HARD_FAULT)
+		cmdbatch->fault_policy &= ~KGSL_FT_REPLAY;
+
+	/*
+	 * A timeout fault means the IB timed out - don't be silly and replay
+	 * it, because it will probably timeout again
+	 */
+
+	if (dispatcher->fault == ADRENO_DISPATCHER_TIMEOUT_FAULT)
+		cmdbatch->fault_policy &= ~KGSL_FT_REPLAY;
+
+	/*
+	 * Execute the fault tolerance policy. Each command batch stores the
+	 * current fault policy that was set when it was queued.
+	 * As the options are tried in descending priority
+	 * (REPLAY -> SKIPIBS -> SKIPFRAME -> NOTHING) the bits are cleared
+	 * from the cmdbatch policy so the next thing can be tried if the
+	 * change comes around again
+	 * chance comes around again
+
+	/* Replay the hanging command batch again */
+	if (cmdbatch->fault_policy & KGSL_FT_REPLAY) {
+		cmdbatch->fault_policy &= ~KGSL_FT_REPLAY;
+		goto replay;
+	}
+
+	/*
+	 * Skip the last IB1 that was played but replay everything else.
+	 * Note that the last IB1 might not be in the "hung" command batch
+	 * because the CP may have caused a page-fault while it was prefetching
+	 * the next IB1/IB2. Walk all outstanding commands and zap the
+	 * supposedly bad IB1 wherever it lurks.
+	 */
+
+	if (cmdbatch->fault_policy & KGSL_FT_SKIPIB) {
+		cmdbatch->fault_policy &= ~KGSL_FT_SKIPIB;
+
+		for (i = 0; i < count; i++) {
+			if (replay[i] != NULL)
+				cmdbatch_skip_ib(replay[i], base);
+		}
+
+		goto replay;
+	}
+
+	if (cmdbatch->fault_policy & KGSL_FT_SKIPFRAME) {
+
+		cmdbatch->fault_policy &= ~KGSL_FT_SKIPFRAME;
+
+		/*
+		 * Skip all the pending command batches for this context until
+		 * the EOF frame is seen
+		 */
+		cmdbatch_skip_frame(cmdbatch, replay, count);
+		goto replay;
+	}
+
+	/* If we get here then all the policies failed or FT is disabled */
+
+	/* Invalidate the context */
+	adreno_drawctxt_invalidate(device, cmdbatch->context);
+
+	/* Remove any pending command batches that have been invalidated */
+	remove_invalidated_cmdbatches(device, replay, count);
+
+replay:
+	/* Reset the dispatcher queue */
+	dispatcher->inflight = 0;
+	dispatcher->head = dispatcher->tail = 0;
+
+	/* Reset the GPU */
+	mutex_lock(&device->mutex);
+	ret = adreno_reset(device);
+	mutex_unlock(&device->mutex);
+
+	/* If adreno_reset() fails then what hope do we have for the future? */
+	BUG_ON(ret);
+
+	/*
+	 * Force the preamble on the first command (if applicable) to avoid any
+	 * strange stage issues
+	 */
+
+	if (replay != NULL && replay[0])
+		replay[0]->priv |= CMDBATCH_FLAG_FORCE_PREAMBLE;
+
+	/* Replay the pending command buffers */
+	for (i = 0; i < count; i++) {
+
+		int ret;
+
+		if (replay[i] == NULL)
+			continue;
+
+		/*
+		 * Force each command batch to wait for idle - this avoids weird
+		 * CP parse issues
+		 */
+
+		replay[i]->flags |= KGSL_CMD_FLAGS_WFI;
+
+		ret = sendcmd(adreno_dev, replay[i]);
+
+		/*
+		 * If sending the command fails, then try to recover by
+		 * invalidating the context
+		 */
+
+		if (ret) {
+			/* Invalidate the context that owns the failed batch */
+			adreno_drawctxt_invalidate(device,
+				replay[i]->context);
+
+			remove_invalidated_cmdbatches(device, &replay[i],
+				count - i);
+		}
+	}
+
+	mutex_lock(&device->mutex);
+	kgsl_active_count_put(device);
+	mutex_unlock(&device->mutex);
+
+	kfree(replay);
+}
+
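+/*
+ * Helper used by the fault handler: returns true if the command batch
+ * timestamp is at or beyond the last consumed timestamp but has not been
+ * retired - i.e. the hardware had started (or was about to start) it
+ */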
+static inline int cmdbatch_consumed(struct kgsl_cmdbatch *cmdbatch,
+		unsigned int consumed, unsigned int retired)
+{
+	return ((timestamp_cmp(cmdbatch->timestamp, consumed) >= 0) &&
+		(timestamp_cmp(retired, cmdbatch->timestamp) < 0));
+}
+
+/**
+ * adreno_dispatcher_work() - Master work handler for the dispatcher
+ * @work: Pointer to the work struct for the current work queue
+ *
+ * Process expired commands and send new ones.
+ */
+static void adreno_dispatcher_work(struct work_struct *work)
+{
+	struct adreno_dispatcher *dispatcher =
+		container_of(work, struct adreno_dispatcher, work);
+	struct adreno_device *adreno_dev =
+		container_of(dispatcher, struct adreno_device, dispatcher);
+	struct kgsl_device *device = &adreno_dev->dev;
+	int count = 0;
+
+	mutex_lock(&dispatcher->mutex);
+
+	while (dispatcher->head != dispatcher->tail) {
+		uint32_t consumed, retired = 0;
+		struct kgsl_cmdbatch *cmdbatch =
+			dispatcher->cmdqueue[dispatcher->head];
+		struct adreno_context *drawctxt;
+		BUG_ON(cmdbatch == NULL);
+
+		drawctxt = ADRENO_CONTEXT(cmdbatch->context);
+
+		/*
+		 * First try to expire the timestamp. This happens if the
+		 * context is valid and the timestamp expired normally or if the
+		 * context was destroyed before the command batch finished in
+		 * the GPU.  Either way, retire the command batch, advance the
+		 * pointers and continue processing the queue
+		 */
+
+		if (!kgsl_context_detached(cmdbatch->context))
+			retired = kgsl_readtimestamp(device, cmdbatch->context,
+				KGSL_TIMESTAMP_RETIRED);
+
+		if (kgsl_context_detached(cmdbatch->context) ||
+			(timestamp_cmp(cmdbatch->timestamp, retired) <= 0)) {
+
+			trace_adreno_cmdbatch_retired(cmdbatch,
+				dispatcher->inflight - 1);
+
+			/* Reduce the number of inflight command batches */
+			dispatcher->inflight--;
+
+			/* Zero the old entry */
+			dispatcher->cmdqueue[dispatcher->head] = NULL;
+
+			/* Advance the buffer head */
+			dispatcher->head = CMDQUEUE_NEXT(dispatcher->head,
+				ADRENO_DISPATCH_CMDQUEUE_SIZE);
+
+			/* Destroy the retired command batch */
+			kgsl_cmdbatch_destroy(cmdbatch);
+
+			/* Update the expire time for the next command batch */
+
+			if (dispatcher->inflight > 0) {
+				cmdbatch =
+					dispatcher->cmdqueue[dispatcher->head];
+				cmdbatch->expires = jiffies +
+					msecs_to_jiffies(_cmdbatch_timeout);
+			}
+
+			count++;
+			continue;
+		}
+
+		/*
+		 * If we got a fault from the interrupt handler, this command
+		 * is to blame.  Invalidate it, reset and replay
+		 */
+
+		if (dispatcher->fault) {
+			dispatcher_do_fault(device);
+			dispatcher->fault = 0;
+			goto done;
+		}
+
+		/* Get the last consumed timestamp */
+		consumed = kgsl_readtimestamp(device, cmdbatch->context,
+			KGSL_TIMESTAMP_CONSUMED);
+
+		/*
+		 * Break here if fault detection is disabled for the context or
+		 * if long running IB detection is disabled device wide.
+		 * Long running command buffers will be allowed to run to
+		 * completion - but badly behaving command buffers (infinite
+		 * shaders etc) can end up running forever.
+		 */
+
+		if (!adreno_dev->long_ib_detect ||
+			drawctxt->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE)
+			break;
+
+		/*
+		 * The last line of defense is to check if the command batch has
+		 * timed out. If we get this far but the timeout hasn't expired
+		 * yet then the GPU is still ticking away
+		 */
+
+		if (time_is_after_jiffies(cmdbatch->expires))
+			break;
+
+		/* Boom goes the dynamite */
+
+		KGSL_DRV_ERR(device,
+			"Context %d, timestamp %d ran too long\n",
+			drawctxt->base.id, drawctxt->timestamp);
+
+		dispatcher->fault = ADRENO_DISPATCHER_TIMEOUT_FAULT;
+
+		dispatcher_do_fault(device);
+		break;
+	}
+
+	/*
+	 * Decrement the active count to 0 - this will allow the system to go
+	 * into suspend even if there are queued command batches
+	 */
+
+	if (count && dispatcher->inflight == 0) {
+		mutex_lock(&device->mutex);
+		kgsl_active_count_put(device);
+		mutex_unlock(&device->mutex);
+	}
+
+	/* Dispatch new commands if we have the room */
+	if (dispatcher->inflight < _dispatcher_inflight)
+		_adreno_dispatcher_issuecmds(adreno_dev);
+
+done:
+	/* Either update the timer for the next command batch or disable it */
+	if (dispatcher->inflight) {
+		struct kgsl_cmdbatch *cmdbatch
+			= dispatcher->cmdqueue[dispatcher->head];
+
+		/* Update the timeout timer for the next command batch */
+		mod_timer(&dispatcher->timer, cmdbatch->expires);
+	} else {
+		del_timer_sync(&dispatcher->timer);
+		del_timer_sync(&dispatcher->fault_timer);
+	}
+
+	/* Before leaving update the pwrscale information */
+	mutex_lock(&device->mutex);
+	kgsl_pwrscale_idle(device);
+	mutex_unlock(&device->mutex);
+
+	mutex_unlock(&dispatcher->mutex);
+}
+
+void adreno_dispatcher_schedule(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	queue_work(device->work_queue, &dispatcher->work);
+}
+
+/**
+ * adreno_dispatcher_queue_context() - schedule a drawctxt in the dispatcher
+ * @device: pointer to the KGSL device
+ * @drawctxt: pointer to the drawctxt to schedule
+ *
+ * Put a draw context on the dispatcher pending queue and schedule the
+ * dispatcher. This is used to reschedule contexts that might have been
+ * blocked on sync points or other concerns
+ */
+void adreno_dispatcher_queue_context(struct kgsl_device *device,
+	struct adreno_context *drawctxt)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	dispatcher_queue_context(adreno_dev, drawctxt);
+	adreno_dispatcher_schedule(device);
+}
+
+/*
+ * This is called on a regular basis while command batches are inflight.  Fault
+ * detection registers are read and compared to the existing values - if they
+ * changed then the GPU is still running.  If they are the same between
+ * subsequent calls then the GPU may have faulted
+ */
+
+void adreno_dispatcher_fault_timer(unsigned long data)
+{
+	struct adreno_device *adreno_dev = (struct adreno_device *) data;
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	/* Leave if the user decided to turn off fast hang detection */
+	if (adreno_dev->fast_hang_detect == 0)
+		return;
+
+	/* Don't do anything if the dispatcher is idle or faulted */
+	if (dispatcher->inflight == 0 || dispatcher->fault)
+		return;
+
+	/* Make sure the device is active before trying a read */
+	if (device->state != KGSL_STATE_ACTIVE)
+		return;
+
+	/*
+	 * Read and compare the fault registers - if the compare returns 0 then
+	 * they haven't changed, so mark the dispatcher as faulted and schedule
+	 * the work loop.
+	 */
+
+	if (!fault_detect_read_compare(device)) {
+		dispatcher->fault = ADRENO_DISPATCHER_SOFT_FAULT;
+		adreno_dispatcher_schedule(device);
+	} else {
+		mod_timer(&dispatcher->fault_timer,
+			jiffies + msecs_to_jiffies(_fault_timer_interval));
+	}
+}
+
+/*
+ * This is called when the timer expires - it either means the GPU is hung or
+ * the IB is taking too long to execute
+ */
+void adreno_dispatcher_timer(unsigned long data)
+{
+	struct adreno_device *adreno_dev = (struct adreno_device *) data;
+	struct kgsl_device *device = &adreno_dev->dev;
+
+	adreno_dispatcher_schedule(device);
+}
+/**
+ * adreno_dispatcher_irq_fault() - Trigger a fault in the dispatcher
+ * @device: Pointer to the KGSL device
+ *
+ * Called from an interrupt context this will trigger a fault in the
+ * dispatcher for the oldest pending command batch
+ */
+void adreno_dispatcher_irq_fault(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	dispatcher->fault = ADRENO_DISPATCHER_HARD_FAULT;
+	adreno_dispatcher_schedule(device);
+}
+
+/**
+ * adreno_dispatcher_pause() - stop the dispatcher
+ * @adreno_dev: pointer to the adreno device structure
+ *
+ * Pause the dispatcher so it doesn't accept any new commands
+ */
+void adreno_dispatcher_pause(struct adreno_device *adreno_dev)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	/*
+	 * This will probably get called while holding other mutexes so don't
+	 * take the dispatcher mutex.  The biggest penalty is that another
+	 * command might be submitted while we are in here but that's okay
+	 * because whoever is waiting for the drain will just have another
+	 * command batch to wait for
+	 */
+
+	dispatcher->state = ADRENO_DISPATCHER_PAUSE;
+}
+
+/**
+ * adreno_dispatcher_start() - activate the dispatcher
+ * @adreno_dev: pointer to the adreno device structure
+ *
+ * Set the dispatcher active and start the loop once to get things going
+ */
+void adreno_dispatcher_start(struct adreno_device *adreno_dev)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	dispatcher->state = ADRENO_DISPATCHER_ACTIVE;
+
+	/* Schedule the work loop to get things going */
+	adreno_dispatcher_schedule(&adreno_dev->dev);
+}
+
+/**
+ * adreno_dispatcher_stop() - stop the dispatcher
+ * @adreno_dev: pointer to the adreno device structure
+ *
+ * Stop the dispatcher and close all the timers
+ */
+void adreno_dispatcher_stop(struct adreno_device *adreno_dev)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	del_timer_sync(&dispatcher->timer);
+	del_timer_sync(&dispatcher->fault_timer);
+}
+
+/**
+ * adreno_dispatcher_close() - close the dispatcher
+ * @adreno_dev: pointer to the adreno device structure
+ *
+ * Close the dispatcher and free all the outstanding commands and memory
+ */
+void adreno_dispatcher_close(struct adreno_device *adreno_dev)
+{
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+	mutex_lock(&dispatcher->mutex);
+	del_timer_sync(&dispatcher->timer);
+	del_timer_sync(&dispatcher->fault_timer);
+
+	while (dispatcher->head != dispatcher->tail) {
+		kgsl_cmdbatch_destroy(dispatcher->cmdqueue[dispatcher->head]);
+		dispatcher->head = (dispatcher->head + 1)
+			% ADRENO_DISPATCH_CMDQUEUE_SIZE;
+	}
+
+	kfree(fault_detect_regs);
+	fault_detect_regs = NULL;
+
+	mutex_unlock(&dispatcher->mutex);
+
+	kobject_put(&dispatcher->kobj);
+}
+
+struct dispatcher_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct adreno_dispatcher *,
+			struct dispatcher_attribute *, char *);
+	ssize_t (*store)(struct adreno_dispatcher *,
+			struct dispatcher_attribute *, const char *buf,
+			size_t count);
+	unsigned int max;
+	unsigned int *value;
+};
+
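+/*
+ * Declare a sysfs attribute backed by an unsigned int tunable.  A max of
+ * zero means there is no upper limit; writing zero is always rejected by
+ * _store_uint()
+ */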
+#define DISPATCHER_UINT_ATTR(_name, _mode, _max, _value) \
+	struct dispatcher_attribute dispatcher_attr_##_name =  { \
+		.attr = { .name = __stringify(_name), .mode = _mode }, \
+		.show = _show_uint, \
+		.store = _store_uint, \
+		.max = _max, \
+		.value = &(_value), \
+	}
+
+#define to_dispatcher_attr(_a) \
+	container_of((_a), struct dispatcher_attribute, attr)
+#define to_dispatcher(k) container_of(k, struct adreno_dispatcher, kobj)
+
+static ssize_t _store_uint(struct adreno_dispatcher *dispatcher,
+		struct dispatcher_attribute *attr,
+		const char *buf, size_t size)
+{
+	unsigned long val;
+	int ret = kstrtoul(buf, 0, &val);
+
+	if (ret)
+		return ret;
+
+	if (!val || (attr->max && (val > attr->max)))
+		return -EINVAL;
+
+	*((unsigned int *) attr->value) = val;
+	return size;
+}
+
+static ssize_t _show_uint(struct adreno_dispatcher *dispatcher,
+		struct dispatcher_attribute *attr,
+		char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n",
+		*((unsigned int *) attr->value));
+}
+
+static DISPATCHER_UINT_ATTR(inflight, 0644, ADRENO_DISPATCH_CMDQUEUE_SIZE,
+	_dispatcher_inflight);
+/*
+ * Our code that "puts back" a command from the context is much cleaner
+ * if we are sure that there will always be enough room in the
+ * ringbuffer so restrict the maximum size of the context queue to
+ * ADRENO_CONTEXT_CMDQUEUE_SIZE - 1
+ */
+static DISPATCHER_UINT_ATTR(context_cmdqueue_size, 0644,
+	ADRENO_CONTEXT_CMDQUEUE_SIZE - 1, _context_cmdqueue_size);
+static DISPATCHER_UINT_ATTR(context_burst_count, 0644, 0,
+	_context_cmdbatch_burst);
+static DISPATCHER_UINT_ATTR(cmdbatch_timeout, 0644, 0, _cmdbatch_timeout);
+static DISPATCHER_UINT_ATTR(context_queue_wait, 0644, 0, _context_queue_wait);
+static DISPATCHER_UINT_ATTR(fault_detect_interval, 0644, 0,
+	_fault_timer_interval);
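+
+/*
+ * Note: cmdbatch_timeout and fault_detect_interval are expressed in
+ * milliseconds (both are fed through msecs_to_jiffies).  The knobs appear
+ * under the device sysfs directory; the exact path depends on the platform
+ * device name, but tuning the queue depth typically looks like:
+ *
+ *   echo 20 > /sys/devices/platform/kgsl-3d0.0/dispatch/inflight
+ */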
+
+static struct attribute *dispatcher_attrs[] = {
+	&dispatcher_attr_inflight.attr,
+	&dispatcher_attr_context_cmdqueue_size.attr,
+	&dispatcher_attr_context_burst_count.attr,
+	&dispatcher_attr_cmdbatch_timeout.attr,
+	&dispatcher_attr_context_queue_wait.attr,
+	&dispatcher_attr_fault_detect_interval.attr,
+	NULL,
+};
+
+static ssize_t dispatcher_sysfs_show(struct kobject *kobj,
+				   struct attribute *attr, char *buf)
+{
+	struct adreno_dispatcher *dispatcher = to_dispatcher(kobj);
+	struct dispatcher_attribute *pattr = to_dispatcher_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (pattr->show)
+		ret = pattr->show(dispatcher, pattr, buf);
+
+	return ret;
+}
+
+static ssize_t dispatcher_sysfs_store(struct kobject *kobj,
+				    struct attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct adreno_dispatcher *dispatcher = to_dispatcher(kobj);
+	struct dispatcher_attribute *pattr = to_dispatcher_attr(attr);
+	ssize_t ret = -EIO;
+
+	if (pattr->store)
+		ret = pattr->store(dispatcher, pattr, buf, count);
+
+	return ret;
+}
+
+static void dispatcher_sysfs_release(struct kobject *kobj)
+{
+}
+
+static const struct sysfs_ops dispatcher_sysfs_ops = {
+	.show = dispatcher_sysfs_show,
+	.store = dispatcher_sysfs_store
+};
+
+static struct kobj_type ktype_dispatcher = {
+	.sysfs_ops = &dispatcher_sysfs_ops,
+	.default_attrs = dispatcher_attrs,
+	.release = dispatcher_sysfs_release
+};
+
+/**
+ * adreno_dispatcher_init() - Initialize the dispatcher
+ * @adreno_dev: pointer to the adreno device structure
+ *
+ * Initialize the dispatcher
+ */
+int adreno_dispatcher_init(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+	int ret;
+
+	memset(dispatcher, 0, sizeof(*dispatcher));
+
+	mutex_init(&dispatcher->mutex);
+
+	setup_timer(&dispatcher->timer, adreno_dispatcher_timer,
+		(unsigned long) adreno_dev);
+
+	setup_timer(&dispatcher->fault_timer, adreno_dispatcher_fault_timer,
+		(unsigned long) adreno_dev);
+
+	INIT_WORK(&dispatcher->work, adreno_dispatcher_work);
+
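+	/* Priority-ordered list of contexts with commands pending dispatch */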
+	plist_head_init(&dispatcher->pending);
+	spin_lock_init(&dispatcher->plist_lock);
+
+	dispatcher->state = ADRENO_DISPATCHER_ACTIVE;
+
+	ret = kobject_init_and_add(&dispatcher->kobj, &ktype_dispatcher,
+		&device->dev->kobj, "dispatch");
+
+	fault_detect_regs = kzalloc(FT_DETECT_REGS_COUNT * sizeof(unsigned int),
+		GFP_KERNEL);
+
+	if (fault_detect_regs == NULL)
+		ret = -ENOMEM;
+
+	return ret;
+}
diff --git a/drivers/gpu/msm/adreno_drawctxt.c b/drivers/gpu/msm/adreno_drawctxt.c
index 8f0bca27b0f2d60276ff59a120b6c95a80eb795f..55802c62fe07a5e8f09565d51ac3c2ac344bb1aa 100644
--- a/drivers/gpu/msm/adreno_drawctxt.c
+++ b/drivers/gpu/msm/adreno_drawctxt.c
@@ -13,10 +13,12 @@
 
 #include <linux/slab.h>
 #include <linux/msm_kgsl.h>
+#include <linux/sched.h>
 
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
 #include "adreno.h"
+#include "adreno_trace.h"
 
 #define KGSL_INIT_REFTIMESTAMP		0x7FFFFFFF
 
@@ -132,6 +134,245 @@ void build_quad_vtxbuff(struct adreno_context *drawctxt,
 	*incmd = cmd;
 }
 
+static void wait_callback(struct kgsl_device *device, void *priv, u32 id,
+		u32 timestamp, u32 type)
+{
+	struct adreno_context *drawctxt = priv;
+	wake_up_interruptible_all(&drawctxt->waiting);
+}
+
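+/*
+ * Pick between the io and non-io flavors of the wait_event macros at
+ * runtime - the io variants count the sleeping task as waiting on I/O,
+ * which matters for the power accounting described in
+ * adreno_drawctxt_wait() below
+ */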
+#define adreno_wait_event_interruptible_timeout(wq, condition, timeout, io)   \
+({                                                                            \
+	long __ret = timeout;                                                 \
+	if (io)                                                               \
+		__wait_io_event_interruptible_timeout(wq, condition, __ret);  \
+	else                                                                  \
+		__wait_event_interruptible_timeout(wq, condition, __ret);     \
+	__ret;                                                                \
+})
+
+#define adreno_wait_event_interruptible(wq, condition, io)                    \
+({                                                                            \
+	long __ret;                                                           \
+	if (io)                                                               \
+		__wait_io_event_interruptible(wq, condition, __ret);          \
+	else                                                                  \
+		__wait_event_interruptible(wq, condition, __ret);             \
+	__ret;                                                                \
+})
+
+static int _check_context_timestamp(struct kgsl_device *device,
+		struct adreno_context *drawctxt, unsigned int timestamp)
+{
+	int ret = 0;
+
+	/* Bail if the drawctxt has been invalidated or destroyed */
+	if (kgsl_context_detached(&drawctxt->base) ||
+		drawctxt->state != ADRENO_CONTEXT_STATE_ACTIVE)
+		return 1;
+
+	mutex_lock(&device->mutex);
+	ret = kgsl_check_timestamp(device, &drawctxt->base, timestamp);
+	mutex_unlock(&device->mutex);
+
+	return ret;
+}
+
+/**
+ * adreno_drawctxt_wait() - sleep until a timestamp expires
+ * @adreno_dev: pointer to the adreno_device struct
+ * @context: Pointer to the KGSL context to sleep for
+ * @timestamp: Timestamp to wait on
+ * @timeout: Number of milliseconds to wait (0 for infinite)
+ *
+ * Register an event to wait for a timestamp on a context and sleep until it
+ * has passed.  Returns < 0 on error, -ETIMEDOUT if the timeout expires or 0
+ * on success
+ */
+int adreno_drawctxt_wait(struct adreno_device *adreno_dev,
+		struct kgsl_context *context,
+		uint32_t timestamp, unsigned int timeout)
+{
+	static unsigned int io_cnt;
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct kgsl_pwrctrl *pwr = &device->pwrctrl;
+	struct adreno_context *drawctxt = ADRENO_CONTEXT(context);
+	int ret, io;
+
+	if (kgsl_context_detached(context))
+		return -EINVAL;
+
+	if (drawctxt->state == ADRENO_CONTEXT_STATE_INVALID)
+		return -EDEADLK;
+
+	/* Needs to hold the device mutex */
+	BUG_ON(!mutex_is_locked(&device->mutex));
+
+	trace_adreno_drawctxt_wait_start(context->id, timestamp);
+
+	ret = kgsl_add_event(device, context->id, timestamp,
+		wait_callback, drawctxt, NULL);
+	if (ret)
+		goto done;
+
+	/*
+	 * For proper power accounting sometimes we need to call
+	 * io_wait_interruptible_timeout and sometimes we need to call
+	 * plain old wait_interruptible_timeout. We call the regular
+	 * timeout N times out of 100, where N is a number specified by
+	 * the current power level
+	 */
+
+	io_cnt = (io_cnt + 1) % 100;
+	io = (io_cnt < pwr->pwrlevels[pwr->active_pwrlevel].io_fraction)
+		? 0 : 1;
+
+	mutex_unlock(&device->mutex);
+
+	if (timeout) {
+		ret = (int) adreno_wait_event_interruptible_timeout(
+			drawctxt->waiting,
+			_check_context_timestamp(device, drawctxt, timestamp),
+			msecs_to_jiffies(timeout), io);
+
+		if (ret == 0)
+			ret = -ETIMEDOUT;
+		else if (ret > 0)
+			ret = 0;
+	} else {
+		ret = (int) adreno_wait_event_interruptible(drawctxt->waiting,
+			_check_context_timestamp(device, drawctxt, timestamp),
+				io);
+	}
+
+	mutex_lock(&device->mutex);
+
+	/* -EDEADLK if the context was invalidated while we were waiting */
+	if (drawctxt->state == ADRENO_CONTEXT_STATE_INVALID)
+		ret = -EDEADLK;
+
+
+	/* Return -EINVAL if the context was detached while we were waiting */
+	if (kgsl_context_detached(context))
+		ret = -EINVAL;
+
+done:
+	trace_adreno_drawctxt_wait_done(context->id, timestamp, ret);
+	return ret;
+}
+
+static void global_wait_callback(struct kgsl_device *device, void *priv, u32 id,
+		u32 timestamp, u32 type)
+{
+	struct adreno_context *drawctxt = priv;
+
+	wake_up_interruptible_all(&drawctxt->waiting);
+	kgsl_context_put(&drawctxt->base);
+}
+
+static int _check_global_timestamp(struct kgsl_device *device,
+		unsigned int timestamp)
+{
+	int ret;
+
+	mutex_lock(&device->mutex);
+	ret = kgsl_check_timestamp(device, NULL, timestamp);
+	mutex_unlock(&device->mutex);
+
+	return ret;
+}
+
+int adreno_drawctxt_wait_global(struct adreno_device *adreno_dev,
+		struct kgsl_context *context,
+		uint32_t timestamp, unsigned int timeout)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct adreno_context *drawctxt = ADRENO_CONTEXT(context);
+	int ret;
+
+	/* Needs to hold the device mutex */
+	BUG_ON(!mutex_is_locked(&device->mutex));
+
+	_kgsl_context_get(context);
+
+	trace_adreno_drawctxt_wait_start(KGSL_MEMSTORE_GLOBAL, timestamp);
+
+	ret = kgsl_add_event(device, KGSL_MEMSTORE_GLOBAL, timestamp,
+		global_wait_callback, drawctxt, NULL);
+	if (ret) {
+		kgsl_context_put(context);
+		goto done;
+	}
+
+	mutex_unlock(&device->mutex);
+
+	if (timeout) {
+		ret = (int) wait_event_interruptible_timeout(drawctxt->waiting,
+			_check_global_timestamp(device, timestamp),
+			msecs_to_jiffies(timeout));
+
+		if (ret == 0)
+			ret = -ETIMEDOUT;
+		else if (ret > 0)
+			ret = 0;
+	} else {
+		ret = (int) wait_event_interruptible(drawctxt->waiting,
+			_check_global_timestamp(device, timestamp));
+	}
+
+	mutex_lock(&device->mutex);
+
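+	/*
+	 * If the wait was interrupted or timed out, cancel the pending event
+	 * so the context reference taken for global_wait_callback() is not
+	 * leaked
+	 */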
+	if (ret)
+		kgsl_cancel_events_timestamp(device, NULL, timestamp);
+
+done:
+	trace_adreno_drawctxt_wait_done(KGSL_MEMSTORE_GLOBAL, timestamp, ret);
+	return ret;
+}
+
+/**
+ * adreno_drawctxt_invalidate() - Invalidate an adreno draw context
+ * @device: Pointer to the KGSL device structure for the GPU
+ * @context: Pointer to the KGSL context structure
+ *
+ * Invalidate the context and remove all queued commands and cancel any pending
+ * waiters
+ */
+void adreno_drawctxt_invalidate(struct kgsl_device *device,
+		struct kgsl_context *context)
+{
+	struct adreno_context *drawctxt = ADRENO_CONTEXT(context);
+
+	drawctxt->state = ADRENO_CONTEXT_STATE_INVALID;
+
+	/* Clear the pending queue */
+	mutex_lock(&drawctxt->mutex);
+
+	while (drawctxt->cmdqueue_head != drawctxt->cmdqueue_tail) {
+		struct kgsl_cmdbatch *cmdbatch =
+			drawctxt->cmdqueue[drawctxt->cmdqueue_head];
+
+		drawctxt->cmdqueue_head = (drawctxt->cmdqueue_head + 1) %
+			ADRENO_CONTEXT_CMDQUEUE_SIZE;
+
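+		/*
+		 * Don't hold the drawctxt mutex while canceling events or
+		 * destroying the cmdbatch - both paths need the device mutex
+		 * and taking it under the drawctxt mutex would invert the
+		 * lock order
+		 */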
+		mutex_unlock(&drawctxt->mutex);
+
+		mutex_lock(&device->mutex);
+		kgsl_cancel_events_timestamp(device, context,
+			cmdbatch->timestamp);
+		mutex_unlock(&device->mutex);
+
+		kgsl_cmdbatch_destroy(cmdbatch);
+		mutex_lock(&drawctxt->mutex);
+	}
+
+	mutex_unlock(&drawctxt->mutex);
+
+	/* Give the bad news to everybody waiting around */
+	wake_up_interruptible_all(&drawctxt->waiting);
+	wake_up_interruptible_all(&drawctxt->wq);
+}
+
 /**
  * adreno_drawctxt_create - create a new adreno draw context
  * @device - KGSL device to create the context on
@@ -142,48 +383,60 @@ void build_quad_vtxbuff(struct adreno_context *drawctxt,
  * Create a new draw context for the 3D core.  Return 0 on success,
  * or error code on failure.
  */
-int adreno_drawctxt_create(struct kgsl_device *device,
-			struct kgsl_pagetable *pagetable,
-			struct kgsl_context *context, uint32_t *flags)
+struct kgsl_context *
+adreno_drawctxt_create(struct kgsl_device_private *dev_priv,
+		uint32_t *flags)
 {
 	struct adreno_context *drawctxt;
+	struct kgsl_device *device = dev_priv->device;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	int ret;
 
 	drawctxt = kzalloc(sizeof(struct adreno_context), GFP_KERNEL);
 
 	if (drawctxt == NULL)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
+
+	ret = kgsl_context_init(dev_priv, &drawctxt->base);
+	if (ret != 0) {
+		kfree(drawctxt);
+		return ERR_PTR(ret);
+	}
 
-	drawctxt->pid = task_pid_nr(current);
-	strlcpy(drawctxt->pid_name, current->comm, TASK_COMM_LEN);
-	drawctxt->pagetable = pagetable;
 	drawctxt->bin_base_offset = 0;
-	drawctxt->id = context->id;
 	drawctxt->timestamp = 0;
 
 	*flags &= (KGSL_CONTEXT_PREAMBLE |
 		KGSL_CONTEXT_NO_GMEM_ALLOC |
 		KGSL_CONTEXT_PER_CONTEXT_TS |
 		KGSL_CONTEXT_USER_GENERATED_TS |
+		KGSL_CONTEXT_NO_FAULT_TOLERANCE |
 		KGSL_CONTEXT_TYPE_MASK);
 
+	/* Always enable per-context timestamps */
+	*flags |= KGSL_CONTEXT_PER_CONTEXT_TS;
+	drawctxt->flags |= CTXT_FLAGS_PER_CONTEXT_TS;
+
 	if (*flags & KGSL_CONTEXT_PREAMBLE)
 		drawctxt->flags |= CTXT_FLAGS_PREAMBLE;
 
 	if (*flags & KGSL_CONTEXT_NO_GMEM_ALLOC)
 		drawctxt->flags |= CTXT_FLAGS_NOGMEMALLOC;
 
-	if (*flags & KGSL_CONTEXT_PER_CONTEXT_TS)
-		drawctxt->flags |= CTXT_FLAGS_PER_CONTEXT_TS;
-
-	if (*flags & KGSL_CONTEXT_USER_GENERATED_TS) {
-		if (!(*flags & KGSL_CONTEXT_PER_CONTEXT_TS)) {
-			ret = -EINVAL;
-			goto err;
-		}
+	if (*flags & KGSL_CONTEXT_USER_GENERATED_TS)
 		drawctxt->flags |= CTXT_FLAGS_USER_GENERATED_TS;
-	}
+
+	mutex_init(&drawctxt->mutex);
+	init_waitqueue_head(&drawctxt->wq);
+	init_waitqueue_head(&drawctxt->waiting);
+
+	/*
+	 * Set up the plist node for the dispatcher.  For now all contexts have
+	 * the same priority, but later the priority will be set at create time
+	 * by the user
+	 */
+
+	plist_node_init(&drawctxt->pending, ADRENO_CONTEXT_DEFAULT_PRIORITY);
 
 	if (*flags & KGSL_CONTEXT_NO_FAULT_TOLERANCE)
 		drawctxt->flags |= CTXT_FLAGS_NO_FAULT_TOLERANCE;
@@ -196,43 +449,52 @@ int adreno_drawctxt_create(struct kgsl_device *device,
 		goto err;
 
 	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(drawctxt->id, ref_wait_ts),
-			KGSL_INIT_REFTIMESTAMP);
-	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(drawctxt->id, ts_cmp_enable), 0);
+			KGSL_MEMSTORE_OFFSET(drawctxt->base.id, soptimestamp),
+			0);
 	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(drawctxt->id, soptimestamp), 0);
-	kgsl_sharedmem_writel(&device->memstore,
-			KGSL_MEMSTORE_OFFSET(drawctxt->id, eoptimestamp), 0);
+			KGSL_MEMSTORE_OFFSET(drawctxt->base.id, eoptimestamp),
+			0);
 
-	context->devctxt = drawctxt;
-	return 0;
+	return &drawctxt->base;
 err:
-	kfree(drawctxt);
-	return ret;
+	kgsl_context_put(&drawctxt->base);
+	return ERR_PTR(ret);
 }
 
 /**
- * adreno_drawctxt_destroy - destroy a draw context
- * @device - KGSL device that owns the context
- * @context- Generic KGSL context container for the context
+ * adreno_drawctxt_sched() - Schedule a previously blocked context
+ * @device: pointer to a KGSL device
+ * @context: Pointer to the context to reschedule
  *
- * Destroy an existing context.  Return 0 on success or error
- * code on failure.
+ * This function is called by the core when it knows that a previously blocked
+ * context has been unblocked.  The default adreno response is to reschedule the
+ * context on the dispatcher
  */
+void adreno_drawctxt_sched(struct kgsl_device *device,
+		struct kgsl_context *context)
+{
+	adreno_dispatcher_queue_context(device, ADRENO_CONTEXT(context));
+}
 
-/* destroy a drawing context */
-
-void adreno_drawctxt_destroy(struct kgsl_device *device,
-			  struct kgsl_context *context)
+/**
+ * adreno_drawctxt_detach(): detach a context from the GPU
+ * @context: Generic KGSL context container for the context
+ *
+ */
+int adreno_drawctxt_detach(struct kgsl_context *context)
 {
-	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct kgsl_device *device;
+	struct adreno_device *adreno_dev;
 	struct adreno_context *drawctxt;
+	int ret;
 
-	if (context == NULL || context->devctxt == NULL)
-		return;
+	if (context == NULL)
+		return 0;
+
+	device = context->device;
+	adreno_dev = ADRENO_DEVICE(device);
+	drawctxt = ADRENO_CONTEXT(context);
 
-	drawctxt = context->devctxt;
 	/* deactivate context */
 	if (adreno_dev->drawctxt_active == drawctxt) {
 		/* no need to save GMEM or shader, the context is
@@ -248,18 +510,48 @@ void adreno_drawctxt_destroy(struct kgsl_device *device,
 		adreno_drawctxt_switch(adreno_dev, NULL, 0);
 	}
 
-	if (device->state != KGSL_STATE_HUNG)
-		adreno_idle(device);
+	mutex_lock(&drawctxt->mutex);
+
+	while (drawctxt->cmdqueue_head != drawctxt->cmdqueue_tail) {
+		struct kgsl_cmdbatch *cmdbatch =
+			drawctxt->cmdqueue[drawctxt->cmdqueue_head];
+
+		drawctxt->cmdqueue_head = (drawctxt->cmdqueue_head + 1) %
+			ADRENO_CONTEXT_CMDQUEUE_SIZE;
+
+		mutex_unlock(&drawctxt->mutex);
+
+		/*
+		 * Don't hold the drawctxt mutex while the cmdbatch is being
+		 * destroyed because the cmdbatch destroy takes the device
+		 * mutex and the world falls in on itself
+		 */
+
+		kgsl_cmdbatch_destroy(cmdbatch);
+		mutex_lock(&drawctxt->mutex);
+	}
+
+	mutex_unlock(&drawctxt->mutex);
 
-	if (adreno_is_a20x(adreno_dev) && adreno_dev->drawctxt_active)
-		kgsl_setstate(&device->mmu, adreno_dev->drawctxt_active->id,
-			KGSL_MMUFLAGS_PTUPDATE);
+	/* Wait for the last global timestamp to pass before continuing */
+	ret = adreno_drawctxt_wait_global(adreno_dev, context,
+		drawctxt->internal_timestamp, 10 * 1000);
 
 	kgsl_sharedmem_free(&drawctxt->gpustate);
 	kgsl_sharedmem_free(&drawctxt->context_gmem_shadow.gmemshadow);
 
+	return ret;
+}
+
+
+void adreno_drawctxt_destroy(struct kgsl_context *context)
+{
+	struct adreno_context *drawctxt;
+	if (context == NULL)
+		return;
+
+	drawctxt = ADRENO_CONTEXT(context);
 	kfree(drawctxt);
-	context->devctxt = NULL;
 }
 
 /**
@@ -275,10 +567,12 @@ void adreno_drawctxt_set_bin_base_offset(struct kgsl_device *device,
 				      struct kgsl_context *context,
 				      unsigned int offset)
 {
-	struct adreno_context *drawctxt = context->devctxt;
+	struct adreno_context *drawctxt;
 
-	if (drawctxt)
-		drawctxt->bin_base_offset = offset;
+	if (context == NULL)
+		return;
+	drawctxt = ADRENO_CONTEXT(context);
+	drawctxt->bin_base_offset = offset;
 }
 
 /**
@@ -290,11 +584,12 @@ void adreno_drawctxt_set_bin_base_offset(struct kgsl_device *device,
  * Switch the current draw context
  */
 
-void adreno_drawctxt_switch(struct adreno_device *adreno_dev,
+int adreno_drawctxt_switch(struct adreno_device *adreno_dev,
 				struct adreno_context *drawctxt,
 				unsigned int flags)
 {
 	struct kgsl_device *device = &adreno_dev->dev;
+	int ret = 0;
 
 	if (drawctxt) {
 		if (flags & KGSL_CONTEXT_SAVE_GMEM)
@@ -310,18 +605,44 @@ void adreno_drawctxt_switch(struct adreno_device *adreno_dev,
 	if (adreno_dev->drawctxt_active == drawctxt) {
 		if (adreno_dev->gpudev->ctxt_draw_workaround &&
 			adreno_is_a225(adreno_dev))
-				adreno_dev->gpudev->ctxt_draw_workaround(
+				ret = adreno_dev->gpudev->ctxt_draw_workaround(
 					adreno_dev, drawctxt);
-		return;
+		return ret;
 	}
 
-	KGSL_CTXT_INFO(device, "from %p to %p flags %d\n",
-			adreno_dev->drawctxt_active, drawctxt, flags);
+	KGSL_CTXT_INFO(device, "from %d to %d flags %d\n",
+		adreno_dev->drawctxt_active ?
+		adreno_dev->drawctxt_active->base.id : 0,
+		drawctxt ? drawctxt->base.id : 0, flags);
 
 	/* Save the old context */
-	adreno_dev->gpudev->ctxt_save(adreno_dev, adreno_dev->drawctxt_active);
+	ret = adreno_dev->gpudev->ctxt_save(adreno_dev,
+		adreno_dev->drawctxt_active);
+
+	if (ret) {
+		KGSL_DRV_ERR(device,
+			"Error in GPU context %d save: %d\n",
+			adreno_dev->drawctxt_active->base.id, ret);
+		return ret;
+	}
+
+	/* Put the old instance of the active drawctxt */
+	if (adreno_dev->drawctxt_active)
+		kgsl_context_put(&adreno_dev->drawctxt_active->base);
+
+	/* Get a refcount to the new instance */
+	if (drawctxt)
+		_kgsl_context_get(&drawctxt->base);
 
 	/* Set the new context */
-	adreno_dev->gpudev->ctxt_restore(adreno_dev, drawctxt);
+	ret = adreno_dev->gpudev->ctxt_restore(adreno_dev, drawctxt);
+	if (ret) {
+		KGSL_DRV_ERR(device,
+			"Error in GPU context %d restore: %d\n",
+			drawctxt ? drawctxt->base.id : 0, ret);
+		return ret;
+	}
+
 	adreno_dev->drawctxt_active = drawctxt;
+	return 0;
 }
diff --git a/drivers/gpu/msm/adreno_drawctxt.h b/drivers/gpu/msm/adreno_drawctxt.h
index 8bbeaa9b431fc82da0ad20b9d34ad0725bb62b38..dddc20629b753a621f08c2f20a536f17efa9afc4 100644
--- a/drivers/gpu/msm/adreno_drawctxt.h
+++ b/drivers/gpu/msm/adreno_drawctxt.h
@@ -13,8 +13,6 @@
 #ifndef __ADRENO_DRAWCTXT_H
 #define __ADRENO_DRAWCTXT_H
 
-#include <linux/sched.h>
-
 #include "adreno_pm4types.h"
 #include "a2xx_reg.h"
 
@@ -56,6 +54,8 @@
 #define CTXT_FLAGS_SKIP_EOF             BIT(15)
 /* Context no fault tolerance */
 #define CTXT_FLAGS_NO_FAULT_TOLERANCE  BIT(16)
+/* Force the preamble for the next submission */
+#define CTXT_FLAGS_FORCE_PREAMBLE      BIT(17)
 
 /* Symbolic table for the adreno draw context type */
 #define ADRENO_DRAWCTXT_TYPES \
@@ -65,6 +65,13 @@
 	{ KGSL_CONTEXT_TYPE_C2D, "C2D" }, \
 	{ KGSL_CONTEXT_TYPE_RS, "RS" }
 
+#define ADRENO_CONTEXT_CMDQUEUE_SIZE 128
+
+#define ADRENO_CONTEXT_DEFAULT_PRIORITY 1
+
+#define ADRENO_CONTEXT_STATE_ACTIVE 0
+#define ADRENO_CONTEXT_STATE_INVALID 1
+
 struct kgsl_device;
 struct adreno_device;
 struct kgsl_device_private;
@@ -95,21 +102,58 @@ struct gmem_shadow_t {
 	struct kgsl_memdesc quad_vertices_restore;
 };
 
+/**
+ * struct adreno_context - Adreno GPU draw context
+ * @id: Unique integer ID of the context
+ * @timestamp: Last issued context-specific timestamp
+ * @internal_timestamp: Global timestamp of the last issued command
+ * @state: Current state of the context
+ * @flags: Bitfield controlling behavior of the context
+ * @type: Context type (GL, CL, RS)
+ * @mutex: Mutex to protect the cmdqueue
+ * @pagetable: Pointer to the GPU pagetable for the context
+ * @gpustate: Pointer to the GPU scratch memory for context save/restore
+ * @reg_restore: Command buffer for restoring context registers
+ * @shader_save: Command buffer for saving shaders
+ * @shader_restore: Command buffer to restore shaders
+ * @context_gmem_shadow: GMEM shadow structure for save/restore
+ * @reg_save: A2XX command buffer to save context registers
+ * @shader_fixup: A2XX command buffer to "fix" shaders on restore
+ * @chicken_restore: A2XX command buffer to "fix" register restore
+ * @bin_base_offset: Saved value of the A2XX BIN_BASE_OFFSET register
+ * @regconstant_save: A3XX command buffer to save some registers
+ * @constant_restore: A3XX command buffer to restore some registers
+ * @hlsqcontrol_restore: A3XX command buffer to restore HLSQ registers
+ * @save_fixup: A3XX command buffer to "fix" register save
+ * @restore_fixup: A3XX command buffer to restore register save fixes
+ * @shader_load_commands: A3XX GPU memory descriptor for shader load IB
+ * @shader_save_commands: A3XX GPU memory descriptor for shader save IB
+ * @constant_save_commands: A3XX GPU memory descriptor for constant save IB
+ * @constant_load_commands: A3XX GPU memory descriptor for constant load IB
+ * @cond_execs: A3XX GPU memory descriptor for conditional exec IB
+ * @hlsqcontrol_restore_commands: A3XX GPU memory descriptor for HLSQ restore IB
+ * @cmdqueue: Queue of command batches waiting to be dispatched for this context
+ * @cmdqueue_head: Head of the cmdqueue
+ * @cmdqueue_tail: Tail of the cmdqueue
+ * @pending: Priority list node for the dispatcher list of pending contexts
+ * @wq: Wait queue for contexts sleeping pending room in the cmdqueue
+ * @waiting: Wait queue for contexts waiting for a timestamp or event
+ * @queued: Number of commands queued in the cmdqueue
+ */
 struct adreno_context {
-	pid_t pid;
-	char pid_name[TASK_COMM_LEN];
-	unsigned int id;
+	struct kgsl_context base;
 	unsigned int ib_gpu_time_used;
 	unsigned int timestamp;
+	unsigned int internal_timestamp;
+	int state;
 	uint32_t flags;
 	unsigned int type;
-	struct kgsl_pagetable *pagetable;
+	struct mutex mutex;
 	struct kgsl_memdesc gpustate;
 	unsigned int reg_restore[3];
 	unsigned int shader_save[3];
 	unsigned int shader_restore[3];
 
-	/* Information of the GMEM shadow that is created in context create */
 	struct gmem_shadow_t context_gmem_shadow;
 
 	/* A2XX specific items */
@@ -130,23 +174,44 @@ struct adreno_context {
 	struct kgsl_memdesc constant_load_commands[3];
 	struct kgsl_memdesc cond_execs[4];
 	struct kgsl_memdesc hlsqcontrol_restore_commands[1];
+
+	/* Dispatcher */
+	struct kgsl_cmdbatch *cmdqueue[ADRENO_CONTEXT_CMDQUEUE_SIZE];
+	int cmdqueue_head;
+	int cmdqueue_tail;
+
+	struct plist_node pending;
+	wait_queue_head_t wq;
+	wait_queue_head_t waiting;
+
+	int queued;
 };
 
-int adreno_drawctxt_create(struct kgsl_device *device,
-			struct kgsl_pagetable *pagetable,
-			struct kgsl_context *context,
+
+struct kgsl_context *adreno_drawctxt_create(struct kgsl_device_private *,
 			uint32_t *flags);
 
-void adreno_drawctxt_destroy(struct kgsl_device *device,
-			  struct kgsl_context *context);
+int adreno_drawctxt_detach(struct kgsl_context *context);
+
+void adreno_drawctxt_destroy(struct kgsl_context *context);
 
-void adreno_drawctxt_switch(struct adreno_device *adreno_dev,
+void adreno_drawctxt_sched(struct kgsl_device *device,
+		struct kgsl_context *context);
+
+int adreno_drawctxt_switch(struct adreno_device *adreno_dev,
 				struct adreno_context *drawctxt,
 				unsigned int flags);
 void adreno_drawctxt_set_bin_base_offset(struct kgsl_device *device,
 					struct kgsl_context *context,
 					unsigned int offset);
 
+int adreno_drawctxt_wait(struct adreno_device *adreno_dev,
+		struct kgsl_context *context,
+		uint32_t timestamp, unsigned int timeout);
+
+void adreno_drawctxt_invalidate(struct kgsl_device *device,
+		struct kgsl_context *context);
+
 /* GPU context switch helper functions */
 
 void build_quad_vtxbuff(struct adreno_context *drawctxt,
diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c
index a6e96e19ced353e9e4901af90d195c719e5148cc..8a166fe92f70ec7e277a9e9072745285bd049494 100644
--- a/drivers/gpu/msm/adreno_postmortem.c
+++ b/drivers/gpu/msm/adreno_postmortem.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -21,6 +21,7 @@
 #include "adreno_ringbuffer.h"
 #include "kgsl_cffdump.h"
 #include "kgsl_pwrctrl.h"
+#include "adreno_trace.h"
 
 #include "a2xx_reg.h"
 #include "a3xx_reg.h"
@@ -724,6 +725,9 @@ int adreno_dump(struct kgsl_device *device, int manual)
 	kgsl_regread(device, REG_CP_IB2_BASE, &cp_ib2_base);
 	kgsl_regread(device, REG_CP_IB2_BUFSZ, &cp_ib2_bufsz);
 
+	trace_adreno_gpu_fault(rbbm_status, cp_rb_rptr, cp_rb_wptr,
+			cp_ib1_base, cp_ib1_bufsz, cp_ib2_base, cp_ib2_bufsz);
+
 	/* If postmortem dump is not enabled, dump minimal set and return */
 	if (!device->pm_dump_enable) {
 
@@ -903,5 +907,9 @@ int adreno_dump(struct kgsl_device *device, int manual)
 error_vfree:
 	vfree(rb_copy);
 end:
+	/* Restart the dispatcher after a manually triggered dump */
+	if (manual)
+		adreno_dispatcher_start(adreno_dev);
+
 	return result;
 }
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 25878a1cbabf7cfc3013b79b19d0ad41403ead16..ef696088eac8ef6216855edfc43f5a453f5d0db1 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -18,7 +18,6 @@
 #include "kgsl.h"
 #include "kgsl_sharedmem.h"
 #include "kgsl_cffdump.h"
-#include "kgsl_trace.h"
 
 #include "adreno.h"
 #include "adreno_pm4types.h"
@@ -65,9 +64,6 @@ adreno_ringbuffer_waitspace(struct adreno_ringbuffer *rb,
 	unsigned long wait_time;
 	unsigned long wait_timeout = msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
 	unsigned long wait_time_part;
-	unsigned int prev_reg_val[ft_detect_regs_count];
-
-	memset(prev_reg_val, 0, sizeof(prev_reg_val));
 
 	/* if wptr ahead, fill the remaining with NOPs */
 	if (wptr_ahead) {
@@ -105,43 +101,13 @@ adreno_ringbuffer_waitspace(struct adreno_ringbuffer *rb,
 		if (freecmds == 0 || freecmds > numcmds)
 			break;
 
-		/* Dont wait for timeout, detect hang faster.
-		 */
-		if (time_after(jiffies, wait_time_part)) {
-			wait_time_part = jiffies +
-				msecs_to_jiffies(KGSL_TIMEOUT_PART);
-			if ((adreno_ft_detect(rb->device,
-						prev_reg_val))){
-				KGSL_DRV_ERR(rb->device,
-				"Hang detected while waiting for freespace in"
-				"ringbuffer rptr: 0x%x, wptr: 0x%x\n",
-				rb->rptr, rb->wptr);
-				goto err;
-			}
-		}
-
 		if (time_after(jiffies, wait_time)) {
 			KGSL_DRV_ERR(rb->device,
 			"Timed out while waiting for freespace in ringbuffer "
 			"rptr: 0x%x, wptr: 0x%x\n", rb->rptr, rb->wptr);
-			goto err;
+			return -ETIMEDOUT;
 		}
 
-		continue;
-
-err:
-		if (!adreno_dump_and_exec_ft(rb->device)) {
-			if (context && context->flags & CTXT_FLAGS_GPU_HANG) {
-				KGSL_CTXT_WARN(rb->device,
-				"Context %p caused a gpu hang. Will not accept commands for context %d\n",
-				context, context->id);
-				return -EDEADLK;
-			}
-			wait_time = jiffies + wait_timeout;
-		} else {
-			/* GPU is hung and fault tolerance failed */
-			BUG();
-		}
 	}
 	return 0;
 }
@@ -179,7 +145,8 @@ unsigned int *adreno_ringbuffer_allocspace(struct adreno_ringbuffer *rb,
 	if (!ret) {
 		ptr = (unsigned int *)rb->buffer_desc.hostptr + rb->wptr;
 		rb->wptr += numcmds;
-	}
+	} else
+		ptr = ERR_PTR(ret);
 
 	return ptr;
 }
@@ -320,10 +287,9 @@ int adreno_ringbuffer_load_pfp_ucode(struct kgsl_device *device)
 	return 0;
 }
 
-int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram)
+int adreno_ringbuffer_start(struct adreno_ringbuffer *rb)
 {
 	int status;
-	/*cp_rb_cntl_u cp_rb_cntl; */
 	union reg_cp_rb_cntl cp_rb_cntl;
 	unsigned int rb_cntl;
 	struct kgsl_device *device = rb->device;
@@ -332,9 +298,6 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram)
 	if (rb->flags & KGSL_FLAGS_STARTED)
 		return 0;
 
-	if (init_ram)
-		rb->global_ts = 0;
-
 	kgsl_sharedmem_set(&rb->memptrs_desc, 0, 0,
 			   sizeof(struct kgsl_rbmemptrs));
 
@@ -444,7 +407,9 @@ int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram)
 	adreno_regwrite(device, REG_CP_ME_CNTL, 0);
 
 	/* ME init is GPU specific, so jump into the sub-function */
-	adreno_dev->gpudev->rb_init(adreno_dev, rb);
+	status = adreno_dev->gpudev->rb_init(adreno_dev, rb);
+	if (status)
+		return status;
 
 	/* idle device to validate ME INIT */
 	status = adreno_idle(device);
@@ -482,6 +447,7 @@ int adreno_ringbuffer_init(struct kgsl_device *device)
 	 */
 	rb->sizedwords = KGSL_RB_SIZE >> 2;
 
+	rb->buffer_desc.flags = KGSL_MEMFLAGS_GPUREADONLY;
 	/* allocate memory for ringbuffer */
 	status = kgsl_allocate_contiguous(&rb->buffer_desc,
 		(rb->sizedwords << 2));
@@ -505,6 +471,8 @@ int adreno_ringbuffer_init(struct kgsl_device *device)
 	/* overlay structure on memptrs memory */
 	rb->memptrs = (struct kgsl_rbmemptrs *) rb->memptrs_desc.hostptr;
 
+	rb->global_ts = 0;
+
 	return 0;
 }
 
@@ -526,9 +494,9 @@ void adreno_ringbuffer_close(struct adreno_ringbuffer *rb)
 
 static int
 adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
-				struct adreno_context *context,
+				struct adreno_context *drawctxt,
 				unsigned int flags, unsigned int *cmds,
-				int sizedwords)
+				int sizedwords, uint32_t timestamp)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(rb->device);
 	unsigned int *ringcmds;
@@ -537,19 +505,20 @@ adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 	unsigned int rcmd_gpu;
 	unsigned int context_id;
 	unsigned int gpuaddr = rb->device->memstore.gpuaddr;
-	unsigned int timestamp;
 
-	/*
-	 * if the context was not created with per context timestamp
-	 * support, we must use the global timestamp since issueibcmds
-	 * will be returning that one, or if an internal issue then
-	 * use global timestamp.
-	 */
-	if ((context && (context->flags & CTXT_FLAGS_PER_CONTEXT_TS)) &&
-		!(flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE))
-		context_id = context->id;
-	else
+	/* The global timestamp always needs to be incremented */
+	rb->global_ts++;
+
+	/* If this is an internal IB, use the global timestamp for it */
+	if (!drawctxt || (flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
+		timestamp = rb->global_ts;
 		context_id = KGSL_MEMSTORE_GLOBAL;
+	} else {
+		context_id = drawctxt->base.id;
+	}
+
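+	/*
+	 * Remember the global timestamp of this submission - context detach
+	 * waits on internal_timestamp so the last command clears the hardware
+	 * before the context is torn down
+	 */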
+	if (drawctxt)
+		drawctxt->internal_timestamp = rb->global_ts;
 
 	/* reserve space to temporarily turn off protected mode
 	*  error checking if needed
@@ -560,13 +529,8 @@ adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 	/* internal ib command identifier for the ringbuffer */
 	total_sizedwords += (flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE) ? 2 : 0;
 
-	/* Add CP_COND_EXEC commands to generate CP_INTERRUPT */
-	total_sizedwords += context ? 13 : 0;
-
-	if ((context) && (context->flags & CTXT_FLAGS_PER_CONTEXT_TS) &&
-		(flags & (KGSL_CMD_FLAGS_INTERNAL_ISSUE |
-		KGSL_CMD_FLAGS_GET_INT)))
-			total_sizedwords += 2;
+	/* Add two dwords for the CP_INTERRUPT */
+	total_sizedwords += drawctxt ? 2 : 0;
 
 	if (adreno_is_a3xx(adreno_dev))
 		total_sizedwords += 7;
@@ -574,16 +538,25 @@ adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 	if (adreno_is_a2xx(adreno_dev))
 		total_sizedwords += 2; /* CP_WAIT_FOR_IDLE */
 
-	total_sizedwords += 2; /* scratchpad ts for recovery */
 	total_sizedwords += 3; /* sop timestamp */
 	total_sizedwords += 4; /* eop timestamp */
 
-	if (KGSL_MEMSTORE_GLOBAL != context_id)
+	if (drawctxt) {
 		total_sizedwords += 3; /* global timestamp without cache
 					* flush for non-zero context */
+	}
+
+	if (adreno_is_a20x(adreno_dev))
+		total_sizedwords += 2; /* CACHE_FLUSH */
 
-	ringcmds = adreno_ringbuffer_allocspace(rb, context, total_sizedwords);
-	if (!ringcmds)
+	if (flags & KGSL_CMD_FLAGS_WFI)
+		total_sizedwords += 2; /* WFI */
+
+	ringcmds = adreno_ringbuffer_allocspace(rb, drawctxt, total_sizedwords);
+
+	if (IS_ERR(ringcmds))
+		return PTR_ERR(ringcmds);
+	if (ringcmds == NULL)
 		return -ENOSPC;
 
 	rcmd_gpu = rb->buffer_desc.gpuaddr
@@ -597,18 +570,6 @@ adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_CMD_INTERNAL_IDENTIFIER);
 	}
 
-	/* always increment the global timestamp. once. */
-	rb->global_ts++;
-
-	if (KGSL_MEMSTORE_GLOBAL != context_id)
-		timestamp = context->timestamp;
-	else
-		timestamp = rb->global_ts;
-
-	/* scratchpad ts for recovery */
-	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type0_packet(REG_CP_TIMESTAMP, 1));
-	GSL_RB_WRITE(ringcmds, rcmd_gpu, rb->global_ts);
-
 	/* start-of-pipeline timestamp */
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_type3_packet(CP_MEM_WRITE, 2));
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
@@ -669,63 +630,21 @@ adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 		KGSL_MEMSTORE_OFFSET(context_id, eoptimestamp)));
 	GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
 
-	if (KGSL_MEMSTORE_GLOBAL != context_id) {
+	if (drawctxt) {
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_MEM_WRITE, 2));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
-			eoptimestamp)));
+			KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL,
+				eoptimestamp)));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, rb->global_ts);
 	}
-	if (context) {
-		/* Conditional execution based on memory values */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_COND_EXEC, 4));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(
-				context_id, ts_cmp_enable)) >> 2);
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, (gpuaddr +
-			KGSL_MEMSTORE_OFFSET(
-				context_id, ref_wait_ts)) >> 2);
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, timestamp);
-		/* # of conditional command DWORDs */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 8);
-
-		/* Clear the ts_cmp_enable for the context */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_MEM_WRITE, 2));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, gpuaddr +
-			KGSL_MEMSTORE_OFFSET(
-				context_id, ts_cmp_enable));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x0);
-
-		/* Clear the ts_cmp_enable for the global timestamp */
-		GSL_RB_WRITE(ringcmds, rcmd_gpu,
-			cp_type3_packet(CP_MEM_WRITE, 2));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, gpuaddr +
-			KGSL_MEMSTORE_OFFSET(
-				KGSL_MEMSTORE_GLOBAL, ts_cmp_enable));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x0);
 
-		/* Trigger the interrupt */
+	if (drawctxt || (flags & KGSL_CMD_FLAGS_INTERNAL_ISSUE)) {
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
 			cp_type3_packet(CP_INTERRUPT, 1));
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, CP_INT_CNTL__RB_INT_MASK);
 	}
 
-	/*
-	 * If per context timestamps are enabled and any of the kgsl
-	 * internal commands want INT to be generated trigger the INT
-	*/
-	if ((context) && (context->flags & CTXT_FLAGS_PER_CONTEXT_TS) &&
-		(flags & (KGSL_CMD_FLAGS_INTERNAL_ISSUE |
-		KGSL_CMD_FLAGS_GET_INT))) {
-			GSL_RB_WRITE(ringcmds, rcmd_gpu,
-				cp_type3_packet(CP_INTERRUPT, 1));
-			GSL_RB_WRITE(ringcmds, rcmd_gpu,
-				CP_INT_CNTL__RB_INT_MASK);
-	}
-
 	if (adreno_is_a3xx(adreno_dev)) {
 		/* Dummy set-constant to trigger context rollover */
 		GSL_RB_WRITE(ringcmds, rcmd_gpu,
@@ -735,9 +654,10 @@ adreno_ringbuffer_addcmds(struct adreno_ringbuffer *rb,
 		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0);
 	}
 
-	if (flags & KGSL_CMD_FLAGS_EOF) {
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, cp_nop_packet(1));
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, KGSL_END_OF_FRAME_IDENTIFIER);
+	if (flags & KGSL_CMD_FLAGS_WFI) {
+		GSL_RB_WRITE(ringcmds, rcmd_gpu,
+			cp_type3_packet(CP_WAIT_FOR_IDLE, 1));
+		GSL_RB_WRITE(ringcmds, rcmd_gpu, 0x00000000);
 	}
 
 	adreno_ringbuffer_submit(rb);
@@ -755,14 +675,10 @@ adreno_ringbuffer_issuecmds(struct kgsl_device *device,
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
 	struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
 
-	if (device->state & KGSL_STATE_HUNG)
-		return kgsl_readtimestamp(device, KGSL_MEMSTORE_GLOBAL,
-					KGSL_TIMESTAMP_RETIRED);
-
 	flags |= KGSL_CMD_FLAGS_INTERNAL_ISSUE;
 
 	return adreno_ringbuffer_addcmds(rb, drawctxt, flags, cmds,
-							sizedwords);
+		sizedwords, 0);
 }
 
 static bool _parse_ibs(struct kgsl_device_private *dev_priv, uint gpuaddr,
@@ -957,50 +873,108 @@ done:
 	return ret;
 }
 
+/**
+ * _ringbuffer_verify_ib() - parse an IB and verify that it is correct
+ * @dev_priv: Pointer to the process struct
+ * @ibdesc: Pointer to the IB descriptor
+ *
+ * This function walks the IB and does an additional level of parsing and
+ * verification above and beyond what the KGSL core does; the expensive walk
+ * only runs when IB checking is enabled
+ */
+static inline bool _ringbuffer_verify_ib(struct kgsl_device_private *dev_priv,
+		struct kgsl_ibdesc *ibdesc)
+{
+	struct kgsl_device *device = dev_priv->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+	/* Check that the size of the IBs is under the allowable limit */
+	if (ibdesc->sizedwords == 0 || ibdesc->sizedwords > 0xFFFFF) {
+		KGSL_DRV_ERR(device, "Invalid IB size 0x%X\n",
+				ibdesc->sizedwords);
+		return false;
+	}
+
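+	/*
+	 * ib_check_level is a debug tunable; any non-zero level turns on the
+	 * full IB walk in _parse_ibs()
+	 */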
+	if (unlikely(adreno_dev->ib_check_level >= 1) &&
+		!_parse_ibs(dev_priv, ibdesc->gpuaddr, ibdesc->sizedwords)) {
+		KGSL_DRV_ERR(device, "Could not verify the IBs\n");
+		return false;
+	}
+
+	return true;
+}
+
 int
 adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
 				struct kgsl_context *context,
-				struct kgsl_ibdesc *ibdesc,
-				unsigned int numibs,
-				uint32_t *timestamp,
-				unsigned int flags)
+				struct kgsl_cmdbatch *cmdbatch,
+				uint32_t *timestamp)
 {
 	struct kgsl_device *device = dev_priv->device;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	unsigned int *link = 0;
+	struct adreno_context *drawctxt = ADRENO_CONTEXT(context);
+	int i, ret;
+
+	if (drawctxt->state == ADRENO_CONTEXT_STATE_INVALID)
+		return -EDEADLK;
+
+	/* Verify the IBs before they get queued */
+
+	for (i = 0; i < cmdbatch->ibcount; i++) {
+		if (!_ringbuffer_verify_ib(dev_priv, &cmdbatch->ibdesc[i]))
+			return -EINVAL;
+	}
+
+	/* Queue the command batch on the context for the dispatcher */
+	ret = adreno_context_queue_cmd(adreno_dev, drawctxt, cmdbatch,
+		timestamp);
+
+	if (ret)
+		KGSL_DRV_ERR(device, "adreno_context_queue_cmd returned %d\n",
+				ret);
+
+	return ret;
+}
+
+/* adreno_ringbuffer_submitcmd - submit userspace IBs to the GPU */
+int adreno_ringbuffer_submitcmd(struct adreno_device *adreno_dev,
+		struct kgsl_cmdbatch *cmdbatch)
+{
+	struct kgsl_device *device = &adreno_dev->dev;
+	struct kgsl_ibdesc *ibdesc;
+	unsigned int numibs;
+	unsigned int *link;
 	unsigned int *cmds;
 	unsigned int i;
-	struct adreno_context *drawctxt = NULL;
+	struct kgsl_context *context;
+	struct adreno_context *drawctxt;
 	unsigned int start_index = 0;
-	int ret = 0;
+	int ret;
 
-	if (device->state & KGSL_STATE_HUNG) {
-		ret = -EBUSY;
-		goto done;
-	}
+	context = cmdbatch->context;
+	drawctxt = ADRENO_CONTEXT(context);
 
-	if (!(adreno_dev->ringbuffer.flags & KGSL_FLAGS_STARTED) ||
-	      context == NULL || ibdesc == 0 || numibs == 0) {
-		ret = -EINVAL;
-		goto done;
-	}
-	drawctxt = context->devctxt;
+	ibdesc = cmdbatch->ibdesc;
+	numibs = cmdbatch->ibcount;
 
-	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG) {
-		KGSL_CTXT_ERR(device, "proc %s failed fault tolerance"
-			" will not accept commands for context %d\n",
-			drawctxt->pid_name, drawctxt->id);
-		ret = -EDEADLK;
-		goto done;
-	}
+	/*
+	 * When preamble is enabled, the preamble buffer with state restoration
+	 * commands is stored in the first node of the IB chain.  We can skip
+	 * it if a context switch hasn't occurred
+	 */
+
+	if ((drawctxt->flags & CTXT_FLAGS_PREAMBLE) &&
+		!(cmdbatch->priv & CMDBATCH_FLAG_FORCE_PREAMBLE) &&
+		(adreno_dev->drawctxt_active == drawctxt))
+		start_index = 1;
+
+	/*
+	 * In skip mode don't issue the draw IBs but keep all the other
+	 * accoutrements of a submission (including the interrupt) to keep
+	 * the accounting sane. Set start_index and numibs to 0 to just
+	 * generate the start and end markers and skip everything else
+	 */
 
-	if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
-		KGSL_CTXT_ERR(device,
-			"proc %s triggered fault tolerance"
-			" skipping commands for context till EOF %d\n",
-			drawctxt->pid_name, drawctxt->id);
-		if (flags & KGSL_CMD_FLAGS_EOF)
-			drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
+	if (cmdbatch->priv & CMDBATCH_FLAG_SKIP) {
+		start_index = 0;
 		numibs = 0;
 	}
 
@@ -1011,14 +985,6 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
 		goto done;
 	}
 
-	/*When preamble is enabled, the preamble buffer with state restoration
-	commands are stored in the first node of the IB chain. We can skip that
-	if a context switch hasn't occured */
-
-	if (drawctxt->flags & CTXT_FLAGS_PREAMBLE &&
-		adreno_dev->drawctxt_active == drawctxt)
-		start_index = 1;
-
 	if (!start_index) {
 		*cmds++ = cp_nop_packet(1);
 		*cmds++ = KGSL_START_OF_IB_IDENTIFIER;
@@ -1030,19 +996,17 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
 		*cmds++ = ibdesc[0].sizedwords;
 	}
 	for (i = start_index; i < numibs; i++) {
-		if (unlikely(adreno_dev->ib_check_level >= 1 &&
-		    !_parse_ibs(dev_priv, ibdesc[i].gpuaddr,
-				ibdesc[i].sizedwords))) {
-			ret = -EINVAL;
-			goto done;
-		}
 
-		if (ibdesc[i].sizedwords == 0) {
-			ret = -EINVAL;
-			goto done;
-		}
+		/*
+		 * Skip 0 sized IBs - these are presumed to have been removed
+		 * from consideration by the FT policy
+		 */
+
+		if (ibdesc[i].sizedwords == 0)
+			*cmds++ = cp_nop_packet(2);
+		else
+			*cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
 
-		*cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
 		*cmds++ = ibdesc[i].gpuaddr;
 		*cmds++ = ibdesc[i].sizedwords;
 	}
@@ -1050,36 +1014,27 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
 	*cmds++ = cp_nop_packet(1);
 	*cmds++ = KGSL_END_OF_IB_IDENTIFIER;
 
-	kgsl_setstate(&device->mmu, context->id,
+	ret = kgsl_setstate(&device->mmu, context->id,
 		      kgsl_mmu_pt_get_flags(device->mmu.hwpagetable,
 					device->id));
 
-	adreno_drawctxt_switch(adreno_dev, drawctxt, flags);
+	if (ret)
+		goto done;
 
-	if (drawctxt->flags & CTXT_FLAGS_USER_GENERATED_TS) {
-		if (timestamp_cmp(drawctxt->timestamp, *timestamp) >= 0) {
-			KGSL_DRV_ERR(device,
-				"Invalid user generated ts <%d:0x%x>, "
-				"less than last issued ts <%d:0x%x>\n",
-				drawctxt->id, *timestamp, drawctxt->id,
-				drawctxt->timestamp);
-			return -ERANGE;
-		}
-		drawctxt->timestamp = *timestamp;
-	} else
-		drawctxt->timestamp++;
+	ret = adreno_drawctxt_switch(adreno_dev, drawctxt, cmdbatch->flags);
 
-	ret = adreno_ringbuffer_addcmds(&adreno_dev->ringbuffer,
-					drawctxt,
-					(flags & KGSL_CMD_FLAGS_EOF),
-					&link[0], (cmds - link));
+	/*
+	 * In the unlikely event of an error in the drawctxt switch,
+	 * treat it like a hang
+	 */
 	if (ret)
 		goto done;
 
-	if (drawctxt->flags & CTXT_FLAGS_PER_CONTEXT_TS)
-		*timestamp = drawctxt->timestamp;
-	else
-		*timestamp = adreno_dev->ringbuffer.global_ts;
+	ret = adreno_ringbuffer_addcmds(&adreno_dev->ringbuffer,
+					drawctxt,
+					cmdbatch->flags,
+					&link[0], (cmds - link),
+					cmdbatch->timestamp);
 
 #ifdef CONFIG_MSM_KGSL_CFF_DUMP
 	/*
@@ -1090,209 +1045,11 @@ adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
 	adreno_idle(device);
 #endif
 
-	/*
-	 * If context hung and recovered then return error so that the
-	 * application may handle it
-	 */
-	if (drawctxt->flags & CTXT_FLAGS_GPU_HANG_FT) {
-		drawctxt->flags &= ~CTXT_FLAGS_GPU_HANG_FT;
-		ret = -EPROTO;
-	}
-
 done:
-	trace_kgsl_issueibcmds(device, context->id, ibdesc, numibs,
-		*timestamp, flags, ret, drawctxt->type);
+	kgsl_trace_issueibcmds(device, context->id, cmdbatch,
+		cmdbatch->timestamp, cmdbatch->flags, ret,
+		drawctxt->type);
 
 	kfree(link);
 	return ret;
 }
-
-static void _turn_preamble_on_for_ib_seq(struct adreno_ringbuffer *rb,
-				unsigned int rb_rptr)
-{
-	unsigned int temp_rb_rptr = rb_rptr;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int val[2];
-	int i = 0;
-	bool check = false;
-	bool cmd_start = false;
-
-	/* Go till the start of the ib sequence and turn on preamble */
-	while (temp_rb_rptr / sizeof(unsigned int) != rb->wptr) {
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val[i], temp_rb_rptr);
-		if (check && KGSL_START_OF_IB_IDENTIFIER == val[i]) {
-			/* decrement i */
-			i = (i + 1) % 2;
-			if (val[i] == cp_nop_packet(4)) {
-				temp_rb_rptr = adreno_ringbuffer_dec_wrapped(
-						temp_rb_rptr, size);
-				kgsl_sharedmem_writel(&rb->buffer_desc,
-					temp_rb_rptr, cp_nop_packet(1));
-			}
-			KGSL_FT_INFO(rb->device,
-			"Turned preamble on at offset 0x%x\n",
-			temp_rb_rptr / 4);
-			break;
-		}
-		/* If you reach beginning of next command sequence then exit
-		 * First command encountered is the current one so don't break
-		 * on that. */
-		if (KGSL_CMD_IDENTIFIER == val[i]) {
-			if (cmd_start)
-				break;
-			cmd_start = true;
-		}
-
-		i = (i + 1) % 2;
-		if (1 == i)
-			check = true;
-		temp_rb_rptr = adreno_ringbuffer_inc_wrapped(temp_rb_rptr,
-								size);
-	}
-}
-
-void adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
-				struct adreno_ft_data *ft_data)
-{
-	struct kgsl_device *device = rb->device;
-	unsigned int rb_rptr = ft_data->start_of_replay_cmds;
-	unsigned int good_rb_idx = 0, bad_rb_idx = 0, temp_rb_idx = 0;
-	unsigned int last_good_cmd_end_idx = 0, last_bad_cmd_end_idx = 0;
-	unsigned int cmd_start_idx = 0;
-	unsigned int val1 = 0;
-	int copy_rb_contents = 0;
-	unsigned int temp_rb_rptr;
-	struct kgsl_context *k_ctxt;
-	struct adreno_context *a_ctxt;
-	unsigned int size = rb->buffer_desc.size;
-	unsigned int *temp_rb_buffer = ft_data->rb_buffer;
-	int *rb_size = &ft_data->rb_size;
-	unsigned int *bad_rb_buffer = ft_data->bad_rb_buffer;
-	int *bad_rb_size = &ft_data->bad_rb_size;
-	unsigned int *good_rb_buffer = ft_data->good_rb_buffer;
-	int *good_rb_size = &ft_data->good_rb_size;
-
-	/*
-	 * If the start index from where commands need to be copied is invalid
-	 * then no need to save off any commands
-	 */
-	if (0xFFFFFFFF == ft_data->start_of_replay_cmds)
-		return;
-
-	k_ctxt = kgsl_context_get(device, ft_data->context_id);
-
-	if (k_ctxt) {
-		a_ctxt = k_ctxt->devctxt;
-		if (a_ctxt->flags & CTXT_FLAGS_PREAMBLE)
-			_turn_preamble_on_for_ib_seq(rb, rb_rptr);
-		kgsl_context_put(k_ctxt);
-	}
-	k_ctxt = NULL;
-
-	/* Walk the rb from the context switch. Omit any commands
-	 * for an invalid context. */
-	while ((rb_rptr / sizeof(unsigned int)) != rb->wptr) {
-		kgsl_sharedmem_readl(&rb->buffer_desc, &val1, rb_rptr);
-
-		if (KGSL_CMD_IDENTIFIER == val1) {
-			/* Start is the NOP dword that comes before
-			 * KGSL_CMD_IDENTIFIER */
-			cmd_start_idx = temp_rb_idx - 1;
-			if ((copy_rb_contents) && (good_rb_idx))
-				last_good_cmd_end_idx = good_rb_idx - 1;
-			if ((!copy_rb_contents) && (bad_rb_idx))
-				last_bad_cmd_end_idx = bad_rb_idx - 1;
-		}
-
-		/* check for context switch indicator */
-		if (val1 == KGSL_CONTEXT_TO_MEM_IDENTIFIER) {
-			unsigned int temp_idx, val2;
-			/* increment by 3 to get to the context_id */
-			temp_rb_rptr = rb_rptr + (3 * sizeof(unsigned int)) %
-					size;
-			kgsl_sharedmem_readl(&rb->buffer_desc, &val2,
-						temp_rb_rptr);
-
-			/* if context switches to a context that did not cause
-			 * hang then start saving the rb contents as those
-			 * commands can be executed */
-			k_ctxt = kgsl_context_get(rb->device, val2);
-
-			if (k_ctxt) {
-				a_ctxt = k_ctxt->devctxt;
-
-			/* If we are changing to a good context and were not
-			 * copying commands then copy over commands to the good
-			 * context */
-			if (!copy_rb_contents && ((k_ctxt &&
-				!(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) ||
-				!k_ctxt)) {
-				for (temp_idx = cmd_start_idx;
-					temp_idx < temp_rb_idx;
-					temp_idx++)
-					good_rb_buffer[good_rb_idx++] =
-						temp_rb_buffer[temp_idx];
-				ft_data->last_valid_ctx_id = val2;
-				copy_rb_contents = 1;
-				/* remove the good commands from bad buffer */
-				bad_rb_idx = last_bad_cmd_end_idx;
-			} else if (copy_rb_contents && k_ctxt &&
-				(a_ctxt->flags & CTXT_FLAGS_GPU_HANG)) {
-
-				/* If we are changing back to a bad context
-				 * from good ctxt and were not copying commands
-				 * to bad ctxt then copy over commands to
-				 * the bad context */
-				for (temp_idx = cmd_start_idx;
-					temp_idx < temp_rb_idx;
-					temp_idx++)
-					bad_rb_buffer[bad_rb_idx++] =
-						temp_rb_buffer[temp_idx];
-				/* If we are changing to bad context then
-				 * remove the dwords we copied for this
-				 * sequence from the good buffer */
-				good_rb_idx = last_good_cmd_end_idx;
-				copy_rb_contents = 0;
-			}
-			}
-			kgsl_context_put(k_ctxt);
-		}
-
-		if (copy_rb_contents)
-			good_rb_buffer[good_rb_idx++] = val1;
-		else
-			bad_rb_buffer[bad_rb_idx++] = val1;
-
-		/* Copy both good and bad commands to temp buffer */
-		temp_rb_buffer[temp_rb_idx++] = val1;
-
-		rb_rptr = adreno_ringbuffer_inc_wrapped(rb_rptr, size);
-	}
-	*good_rb_size = good_rb_idx;
-	*bad_rb_size = bad_rb_idx;
-	*rb_size = temp_rb_idx;
-}
-
-void
-adreno_ringbuffer_restore(struct adreno_ringbuffer *rb, unsigned int *rb_buff,
-			int num_rb_contents)
-{
-	int i;
-	unsigned int *ringcmds;
-	unsigned int rcmd_gpu;
-
-	if (!num_rb_contents)
-		return;
-
-	if (num_rb_contents > (rb->buffer_desc.size - rb->wptr)) {
-		adreno_regwrite(rb->device, REG_CP_RB_RPTR, 0);
-		rb->rptr = 0;
-		BUG_ON(num_rb_contents > rb->buffer_desc.size);
-	}
-	ringcmds = (unsigned int *)rb->buffer_desc.hostptr + rb->wptr;
-	rcmd_gpu = rb->buffer_desc.gpuaddr + sizeof(unsigned int) * rb->wptr;
-	for (i = 0; i < num_rb_contents; i++)
-		GSL_RB_WRITE(ringcmds, rcmd_gpu, rb_buff[i]);
-	rb->wptr += num_rb_contents;
-	adreno_ringbuffer_submit(rb);
-}
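The deleted fault-tolerance helpers walk the ring with adreno_ringbuffer_inc_wrapped() and adreno_ringbuffer_dec_wrapped(). For readers without the header handy, these amount to modular byte-offset arithmetic over the ring size; a sketch consistent with their use above (the canonical definitions live in adreno_ringbuffer.h):

	/* Sketch: byte offsets that wrap at the end of the ringbuffer */
	static inline unsigned int
	adreno_ringbuffer_inc_wrapped(unsigned int val, unsigned int size)
	{
		return (val + sizeof(unsigned int)) % size;
	}

	static inline unsigned int
	adreno_ringbuffer_dec_wrapped(unsigned int val, unsigned int size)
	{
		return (val + size - sizeof(unsigned int)) % size;
	}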
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index 3157f41432551f10af65398f2dcf9f333712877f..d7a774093354b0848dac638102dee0202515545a 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -27,7 +27,6 @@
 
 struct kgsl_device;
 struct kgsl_device_private;
-struct adreno_ft_data;
 
 #define GSL_RB_MEMPTRS_SCRATCH_COUNT	 8
 struct kgsl_rbmemptrs {
@@ -90,15 +89,15 @@ struct adreno_ringbuffer {
 
 int adreno_ringbuffer_issueibcmds(struct kgsl_device_private *dev_priv,
 				struct kgsl_context *context,
-				struct kgsl_ibdesc *ibdesc,
-				unsigned int numibs,
-				uint32_t *timestamp,
-				unsigned int flags);
+				struct kgsl_cmdbatch *cmdbatch,
+				uint32_t *timestamp);
+
+int adreno_ringbuffer_submitcmd(struct adreno_device *adreno_dev,
+		struct kgsl_cmdbatch *cmdbatch);
 
 int adreno_ringbuffer_init(struct kgsl_device *device);
 
-int adreno_ringbuffer_start(struct adreno_ringbuffer *rb,
-				unsigned int init_ram);
+int adreno_ringbuffer_start(struct adreno_ringbuffer *rb);
 
 void adreno_ringbuffer_stop(struct adreno_ringbuffer *rb);
 
@@ -114,13 +113,6 @@ void adreno_ringbuffer_submit(struct adreno_ringbuffer *rb);
 
 void kgsl_cp_intrcallback(struct kgsl_device *device);
 
-void adreno_ringbuffer_extract(struct adreno_ringbuffer *rb,
-				struct adreno_ft_data *ft_data);
-
-void
-adreno_ringbuffer_restore(struct adreno_ringbuffer *rb, unsigned int *rb_buff,
-			int num_rb_contents);
-
 unsigned int *adreno_ringbuffer_allocspace(struct adreno_ringbuffer *rb,
 						struct adreno_context *context,
 						unsigned int numcmds);
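The header now splits submission in two: adreno_ringbuffer_issueibcmds() remains the ioctl-facing entry point but operates on a kgsl_cmdbatch, while adreno_ringbuffer_submitcmd() is the device-level half driven by the new dispatcher (adreno_dispatch.o in the Makefile). A hedged sketch of the expected call flow, assuming the dispatcher dequeues batches asynchronously:

	/* Sketch: the split submission path implied by the new prototypes */
	ret = adreno_ringbuffer_issueibcmds(dev_priv, context, cmdbatch,
					&timestamp);	/* queue the batch */

	/* ... later, from the dispatcher worker ... */
	ret = adreno_ringbuffer_submitcmd(adreno_dev, cmdbatch);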
diff --git a/drivers/gpu/msm/adreno_snapshot.c b/drivers/gpu/msm/adreno_snapshot.c
index 893cfa61bdced43795e86363c6f8f4ee4329521b..3bcbd580f36df670c7fab133cd5276165793bbf6 100644
--- a/drivers/gpu/msm/adreno_snapshot.c
+++ b/drivers/gpu/msm/adreno_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -161,6 +161,12 @@ static unsigned int vfd_control_0;
 static unsigned int sp_vs_pvt_mem_addr;
 static unsigned int sp_fs_pvt_mem_addr;
 
+/*
+ * Cached values of SP_VS_OBJ_START_REG and SP_FS_OBJ_START_REG.
+ */
+static unsigned int sp_vs_obj_start_reg;
+static unsigned int sp_fs_obj_start_reg;
+
 /*
  * Each load state block has two possible types.  Each type has a different
  * number of dwords per unit.  Use this handy lookup table to make sure
@@ -373,6 +379,26 @@ static int ib_parse_draw_indx(struct kgsl_device *device, unsigned int *pkt,
 		sp_fs_pvt_mem_addr = 0;
 	}
 
+	if (sp_vs_obj_start_reg) {
+		ret = kgsl_snapshot_get_object(device, ptbase,
+			sp_vs_obj_start_reg & 0xFFFFFFE0, 0,
+			SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+		snapshot_frozen_objsize += ret;
+		sp_vs_obj_start_reg = 0;
+	}
+
+	if (sp_fs_obj_start_reg) {
+		ret = kgsl_snapshot_get_object(device, ptbase,
+			sp_fs_obj_start_reg & 0xFFFFFFE0, 0,
+			SNAPSHOT_GPU_OBJECT_GENERIC);
+		if (ret < 0)
+			return -EINVAL;
+		snapshot_frozen_objsize += ret;
+		sp_fs_obj_start_reg = 0;
+	}
+
 	/* Finally: VBOs */
 
 	/* The number of active VBOs is stored in VFD_CONTROL_O[31:27] */
@@ -444,7 +470,7 @@ static void ib_parse_type0(struct kgsl_device *device, unsigned int *ptr,
 	int offset = type0_pkt_offset(*ptr);
 	int i;
 
-	for (i = 0; i < size; i++, offset++) {
+	for (i = 0; i < size - 1; i++, offset++) {
 
 		/* Visibility stream buffer */
 
@@ -505,11 +531,20 @@ static void ib_parse_type0(struct kgsl_device *device, unsigned int *ptr,
 			case A3XX_SP_FS_PVT_MEM_ADDR_REG:
 				sp_fs_pvt_mem_addr = ptr[i + 1];
 				break;
+			case A3XX_SP_VS_OBJ_START_REG:
+				sp_vs_obj_start_reg = ptr[i + 1];
+				break;
+			case A3XX_SP_FS_OBJ_START_REG:
+				sp_fs_obj_start_reg = ptr[i + 1];
+				break;
 			}
 		}
 	}
 }
 
+static inline int parse_ib(struct kgsl_device *device, unsigned int ptbase,
+		unsigned int gpuaddr, unsigned int dwords);
+
 /* Add an IB as a GPU object, but first, parse it to find more goodies within */
 
 static int ib_add_gpu_object(struct kgsl_device *device, unsigned int ptbase,
@@ -549,32 +584,12 @@ static int ib_add_gpu_object(struct kgsl_device *device, unsigned int ptbase,
 			if (adreno_cmd_is_ib(src[i])) {
 				unsigned int gpuaddr = src[i + 1];
 				unsigned int size = src[i + 2];
-				unsigned int ibbase;
-
-				/* Address of the last processed IB2 */
-				kgsl_regread(device, REG_CP_IB2_BASE, &ibbase);
 
-				/*
-				 * If this is the last IB2 that was executed,
-				 * then push it to make sure it goes into the
-				 * static space
-				 */
+				ret = parse_ib(device, ptbase, gpuaddr, size);
 
-				if (ibbase == gpuaddr)
-					push_object(device,
-						SNAPSHOT_OBJ_TYPE_IB, ptbase,
-						gpuaddr, size);
-				else {
-					ret = ib_add_gpu_object(device,
-						ptbase, gpuaddr, size);
-
-					/*
-					 * If adding the IB failed then stop
-					 * parsing
-					 */
-					if (ret < 0)
-						goto done;
-				}
+				/* If adding the IB failed then stop parsing */
+				if (ret < 0)
+					goto done;
 			} else {
 				ret = ib_parse_type3(device, &src[i], ptbase);
 				/*
@@ -604,6 +619,36 @@ done:
 	return ret;
 }
 
+/*
+ * We want to store the last executed IB1 and IB2 in the static region to ensure
+ * that we get at least some information out of the snapshot even if we can't
+ * access the dynamic data from the sysfs file.  Push all other IBs on the
+ * dynamic list
+ */
+static inline int parse_ib(struct kgsl_device *device, unsigned int ptbase,
+		unsigned int gpuaddr, unsigned int dwords)
+{
+	unsigned int ib1base, ib2base;
+	int ret = 0;
+
+	/*
+	 * Check the IB address - if it is either the last executed IB1 or the
+	 * last executed IB2 then push it into the static blob otherwise put
+	 * it in the dynamic list
+	 */
+
+	kgsl_regread(device, REG_CP_IB1_BASE, &ib1base);
+	kgsl_regread(device, REG_CP_IB2_BASE, &ib2base);
+
+	if (gpuaddr == ib1base || gpuaddr == ib2base)
+		push_object(device, SNAPSHOT_OBJ_TYPE_IB, ptbase,
+			gpuaddr, dwords);
+	else
+		ret = ib_add_gpu_object(device, ptbase, gpuaddr, dwords);
+
+	return ret;
+}
+
 /* Snapshot the ringbuffer memory */
 static int snapshot_rb(struct kgsl_device *device, void *snapshot,
 	int remain, void *priv)
@@ -740,13 +785,13 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot,
 
 			struct kgsl_memdesc *memdesc =
 				adreno_find_ctxtmem(device, ptbase, ibaddr,
-					ibsize);
+					ibsize << 2);
 
 			/* IOMMU uses a NOP IB placed in setstate memory */
 			if (NULL == memdesc)
 				if (kgsl_gpuaddr_in_memdesc(
 						&device->mmu.setstate_memory,
-						ibaddr, ibsize))
+						ibaddr, ibsize << 2))
 					memdesc = &device->mmu.setstate_memory;
 			/*
 			 * The IB from CP_IB1_BASE and the IBs for legacy
@@ -754,12 +799,11 @@ static int snapshot_rb(struct kgsl_device *device, void *snapshot,
 			 * others get marked at GPU objects
 			 */
 
-			if (ibaddr == ibbase || memdesc != NULL)
+			if (memdesc != NULL)
 				push_object(device, SNAPSHOT_OBJ_TYPE_IB,
 					ptbase, ibaddr, ibsize);
 			else
-				ib_add_gpu_object(device, ptbase, ibaddr,
-					ibsize);
+				parse_ib(device, ptbase, ibaddr, ibsize);
 		}
 
 		index = index + 1;
@@ -804,15 +848,14 @@ static int snapshot_ib(struct kgsl_device *device, void *snapshot,
 				continue;
 
 			if (adreno_cmd_is_ib(*src))
-				push_object(device, SNAPSHOT_OBJ_TYPE_IB,
-					obj->ptbase, src[1], src[2]);
-			else {
+				ret = parse_ib(device, obj->ptbase, src[1],
+					src[2]);
+			else
 				ret = ib_parse_type3(device, src, obj->ptbase);
 
-				/* Stop parsing if the type3 decode fails */
-				if (ret < 0)
-					break;
-			}
+			/* Stop parsing if the type3 decode fails */
+			if (ret < 0)
+				break;
 		}
 	}
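The two "ibsize << 2" changes above fix a unit mismatch: the CP reports IB sizes in dwords, while adreno_find_ctxtmem() and kgsl_gpuaddr_in_memdesc() expect a length in bytes. Spelled out as a helper (the helper name is illustrative, not part of the patch):

	/* Sketch: dword count from the CP packet -> byte length for lookups */
	static inline unsigned int ib_dwords_to_bytes(unsigned int dwords)
	{
		return dwords << 2;	/* 4 bytes per dword */
	}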
 
diff --git a/drivers/gpu/msm/adreno_trace.c b/drivers/gpu/msm/adreno_trace.c
new file mode 100644
index 0000000000000000000000000000000000000000..607ba8c4afa54ef834cf8aaff04f4884269e4d63
--- /dev/null
+++ b/drivers/gpu/msm/adreno_trace.c
@@ -0,0 +1,18 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "adreno.h"
+
+/* Instantiate tracepoints */
+#define CREATE_TRACE_POINTS
+#include "adreno_trace.h"
diff --git a/drivers/gpu/msm/adreno_trace.h b/drivers/gpu/msm/adreno_trace.h
new file mode 100644
index 0000000000000000000000000000000000000000..8993afb3764e6f46445dcafb88f11dfc9b87eb03
--- /dev/null
+++ b/drivers/gpu/msm/adreno_trace.h
@@ -0,0 +1,169 @@
+/* Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#if !defined(_ADRENO_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _ADRENO_TRACE_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kgsl
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE adreno_trace
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(adreno_cmdbatch_queued,
+	TP_PROTO(struct kgsl_cmdbatch *cmdbatch, unsigned int queued),
+	TP_ARGS(cmdbatch, queued),
+	TP_STRUCT__entry(
+		__field(unsigned int, id)
+		__field(unsigned int, timestamp)
+		__field(unsigned int, queued)
+	),
+	TP_fast_assign(
+		__entry->id = cmdbatch->context->id;
+		__entry->timestamp = cmdbatch->timestamp;
+		__entry->queued = queued;
+	),
+	TP_printk(
+		"ctx=%u ts=%u queued=%u",
+			__entry->id, __entry->timestamp, __entry->queued
+	)
+);
+
+DECLARE_EVENT_CLASS(adreno_cmdbatch_template,
+	TP_PROTO(struct kgsl_cmdbatch *cmdbatch, int inflight),
+	TP_ARGS(cmdbatch, inflight),
+	TP_STRUCT__entry(
+		__field(unsigned int, id)
+		__field(unsigned int, timestamp)
+		__field(unsigned int, inflight)
+	),
+	TP_fast_assign(
+		__entry->id = cmdbatch->context->id;
+		__entry->timestamp = cmdbatch->timestamp;
+		__entry->inflight = inflight;
+	),
+	TP_printk(
+		"ctx=%u ts=%u inflight=%u",
+			__entry->id, __entry->timestamp,
+			__entry->inflight
+	)
+);
+
+DEFINE_EVENT(adreno_cmdbatch_template, adreno_cmdbatch_retired,
+	TP_PROTO(struct kgsl_cmdbatch *cmdbatch, int inflight),
+	TP_ARGS(cmdbatch, inflight)
+);
+
+DEFINE_EVENT(adreno_cmdbatch_template, adreno_cmdbatch_submitted,
+	TP_PROTO(struct kgsl_cmdbatch *cmdbatch, int inflight),
+	TP_ARGS(cmdbatch, inflight)
+);
+
+DECLARE_EVENT_CLASS(adreno_drawctxt_template,
+	TP_PROTO(struct adreno_context *drawctxt),
+	TP_ARGS(drawctxt),
+	TP_STRUCT__entry(
+		__field(unsigned int, id)
+	),
+	TP_fast_assign(
+		__entry->id = drawctxt->base.id;
+	),
+	TP_printk("ctx=%u", __entry->id)
+);
+
+DEFINE_EVENT(adreno_drawctxt_template, adreno_context_sleep,
+	TP_PROTO(struct adreno_context *drawctxt),
+	TP_ARGS(drawctxt)
+);
+
+DEFINE_EVENT(adreno_drawctxt_template, adreno_context_wake,
+	TP_PROTO(struct adreno_context *drawctxt),
+	TP_ARGS(drawctxt)
+);
+
+DEFINE_EVENT(adreno_drawctxt_template, dispatch_queue_context,
+	TP_PROTO(struct adreno_context *drawctxt),
+	TP_ARGS(drawctxt)
+);
+
+TRACE_EVENT(adreno_drawctxt_wait_start,
+	TP_PROTO(unsigned int id, unsigned int ts),
+	TP_ARGS(id, ts),
+	TP_STRUCT__entry(
+		__field(unsigned int, id)
+		__field(unsigned int, ts)
+	),
+	TP_fast_assign(
+		__entry->id = id;
+		__entry->ts = ts;
+	),
+	TP_printk(
+		"ctx=%u ts=%u",
+			__entry->id, __entry->ts
+	)
+);
+
+TRACE_EVENT(adreno_drawctxt_wait_done,
+	TP_PROTO(unsigned int id, unsigned int ts, int status),
+	TP_ARGS(id, ts, status),
+	TP_STRUCT__entry(
+		__field(unsigned int, id)
+		__field(unsigned int, ts)
+		__field(int, status)
+	),
+	TP_fast_assign(
+		__entry->id = id;
+		__entry->ts = ts;
+		__entry->status = status;
+	),
+	TP_printk(
+		"ctx=%u ts=%u status=%d",
+			__entry->id, __entry->ts, __entry->status
+	)
+);
+
+TRACE_EVENT(adreno_gpu_fault,
+	TP_PROTO(unsigned int status, unsigned int rptr, unsigned int wptr,
+		unsigned int ib1base, unsigned int ib1size,
+		unsigned int ib2base, unsigned int ib2size),
+	TP_ARGS(status, rptr, wptr, ib1base, ib1size, ib2base, ib2size),
+	TP_STRUCT__entry(
+		__field(unsigned int, status)
+		__field(unsigned int, rptr)
+		__field(unsigned int, wptr)
+		__field(unsigned int, ib1base)
+		__field(unsigned int, ib1size)
+		__field(unsigned int, ib2base)
+		__field(unsigned int, ib2size)
+	),
+	TP_fast_assign(
+		__entry->status = status;
+		__entry->rptr = rptr;
+		__entry->wptr = wptr;
+		__entry->ib1base = ib1base;
+		__entry->ib1size = ib1size;
+		__entry->ib2base = ib2base;
+		__entry->ib2size = ib2size;
+	),
+	TP_printk("status=%X RB=%X/%X IB1=%X/%X IB2=%X/%X",
+		__entry->status, __entry->wptr, __entry->rptr,
+		__entry->ib1base, __entry->ib1size, __entry->ib2base,
+		__entry->ib2size)
+);
+
+#endif /* _ADRENO_TRACE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
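Once adreno_trace.c instantiates the tracepoints with CREATE_TRACE_POINTS, any driver file that includes adreno_trace.h can fire them directly. A minimal sketch of the expected call sites; the queued and inflight counters shown here are assumptions about the dispatcher state, not code from this patch:

	#include "adreno_trace.h"

	/* when a cmdbatch is queued behind a context */
	trace_adreno_cmdbatch_queued(cmdbatch, queued_count);

	/* when it is written to the ringbuffer, and when it retires */
	trace_adreno_cmdbatch_submitted(cmdbatch, inflight);
	trace_adreno_cmdbatch_retired(cmdbatch, inflight);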
diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c
index d87e8acf092e814eacfa86640166a657f5d13614..bf7ec97a8b7c7e2965fedc8fb00fb769cc3fbee1 100644
--- a/drivers/gpu/msm/kgsl.c
+++ b/drivers/gpu/msm/kgsl.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -14,6 +14,7 @@
 #include <linux/fb.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/list.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/interrupt.h>
@@ -29,6 +30,8 @@
 #include <linux/io.h>
 #include <mach/socinfo.h>
 #include <linux/mman.h>
+#include <linux/sort.h>
+#include <asm/cacheflush.h>
 
 #include "kgsl.h"
 #include "kgsl_debugfs.h"
@@ -53,6 +56,45 @@ MODULE_PARM_DESC(ksgl_mmu_type,
 
 static struct ion_client *kgsl_ion_client;
 
+/**
+ * kgsl_trace_issueibcmds() - Call trace_kgsl_issueibcmds() by proxy
+ * @device: KGSL device
+ * @id: ID of the context submitting the command
+ * @cmdbatch: Pointer to kgsl_cmdbatch describing these commands
+ * @timestamp: Timestamp assigned to the command batch
+ * @flags: Flags sent by the user
+ * @result: Result of the submission attempt
+ * @type: Type of context issuing the command
+ *
+ * Wrap the issueibcmds ftrace hook into a function that can be called from the
+ * GPU specific modules.
+ */
+void kgsl_trace_issueibcmds(struct kgsl_device *device, int id,
+		struct kgsl_cmdbatch *cmdbatch,
+		unsigned int timestamp, unsigned int flags,
+		int result, unsigned int type)
+{
+	trace_kgsl_issueibcmds(device, id, cmdbatch,
+		timestamp, flags, result, type);
+}
+EXPORT_SYMBOL(kgsl_trace_issueibcmds);
+
+/**
+ * kgsl_trace_regwrite() - Call regwrite ftrace function by proxy
+ * @device: KGSL device
+ * @offset: dword offset of the register being written
+ * @value: Value of the register being written
+ *
+ * Wrap the regwrite ftrace hook into a function that can be called from the
+ * GPU specific modules.
+ */
+void kgsl_trace_regwrite(struct kgsl_device *device, unsigned int offset,
+		unsigned int value)
+{
+	trace_kgsl_regwrite(device, offset, value);
+}
+EXPORT_SYMBOL(kgsl_trace_regwrite);
+
 int kgsl_memfree_hist_init(void)
 {
 	void *base;
@@ -104,10 +146,13 @@ void kgsl_memfree_hist_set_event(unsigned int pid, unsigned int gpuaddr,
  * @ptbase - the pagetable base of the object
  * @gpuaddr - the GPU address of the object
  * @size - Size of the region to search
+ *
+ * Caller must kgsl_mem_entry_put() the returned entry when finished using it.
  */
 
-struct kgsl_mem_entry *kgsl_get_mem_entry(struct kgsl_device *device,
-	unsigned int ptbase, unsigned int gpuaddr, unsigned int size)
+struct kgsl_mem_entry * __must_check
+kgsl_get_mem_entry(struct kgsl_device *device, unsigned int ptbase,
+		   unsigned int gpuaddr, unsigned int size)
 {
 	struct kgsl_process_private *priv;
 	struct kgsl_mem_entry *entry;
@@ -117,15 +162,12 @@ struct kgsl_mem_entry *kgsl_get_mem_entry(struct kgsl_device *device,
 	list_for_each_entry(priv, &kgsl_driver.process_list, list) {
 		if (!kgsl_mmu_pt_equal(&device->mmu, priv->pagetable, ptbase))
 			continue;
-		spin_lock(&priv->mem_lock);
 		entry = kgsl_sharedmem_find_region(priv, gpuaddr, size);
 
 		if (entry) {
-			spin_unlock(&priv->mem_lock);
 			mutex_unlock(&kgsl_driver.process_mutex);
 			return entry;
 		}
-		spin_unlock(&priv->mem_lock);
 	}
 	mutex_unlock(&kgsl_driver.process_mutex);
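Because the search now takes a reference while holding the process mem_lock, every caller of kgsl_get_mem_entry() owns a reference on return and must drop it. A sketch of the required pattern (the caller body is illustrative):

	struct kgsl_mem_entry *entry;

	entry = kgsl_get_mem_entry(device, ptbase, gpuaddr, size);
	if (entry) {
		/* entry->memdesc is safe to use here */
		kgsl_mem_entry_put(entry);	/* drop the search reference */
	}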
 
@@ -268,16 +310,28 @@ err:
 
 static void kgsl_mem_entry_detach_process(struct kgsl_mem_entry *entry)
 {
+	bool had_gpuaddr = false;
+
 	if (entry == NULL)
 		return;
 
+	/*
+	 * Unmap the entry first so that there isn't a period of
+	 * time where kgsl doesn't know about the address range
+	 * but it is still present in the pagetable. Unmapping will
+	 * clear the gpuaddr field, so remember whether we had a
+	 * mapping (and hence an rbtree entry) for later.
+	 */
+	had_gpuaddr = entry->memdesc.gpuaddr != 0;
+	kgsl_mmu_unmap(entry->memdesc.pagetable, &entry->memdesc);
+
 	spin_lock(&entry->priv->mem_lock);
 
 	if (entry->id != 0)
 		idr_remove(&entry->priv->mem_idr, entry->id);
 	entry->id = 0;
 
-	if (entry->memdesc.gpuaddr != 0)
+	if (had_gpuaddr)
 		rb_erase(&entry->node, &entry->priv->mem_rb);
 
 	spin_unlock(&entry->priv->mem_lock);
@@ -285,64 +339,66 @@ static void kgsl_mem_entry_detach_process(struct kgsl_mem_entry *entry)
 	entry->priv->stats[entry->memtype].cur -= entry->memdesc.size;
 	entry->priv = NULL;
 
-	kgsl_mmu_unmap(entry->memdesc.pagetable, &entry->memdesc);
 
 	kgsl_mem_entry_put(entry);
 }
 
-/* Allocate a new context id */
-
-static struct kgsl_context *
-kgsl_create_context(struct kgsl_device_private *dev_priv)
+/**
+ * kgsl_context_init() - helper to initialize kgsl_context members
+ * @dev_priv: the owner of the context
+ * @context: the newly created context struct, should be allocated by
+ * the device specific drawctxt_create function.
+ *
+ * This is a helper function for the device specific drawctxt_create
+ * function to initialize the common members of its context struct.
+ * If this function succeeds, reference counting is active in the context
+ * struct and the caller should kgsl_context_put() it on error.
+ * If it fails, the caller should just free the context structure
+ * it passed in.
+ */
+int kgsl_context_init(struct kgsl_device_private *dev_priv,
+			struct kgsl_context *context)
 {
-	struct kgsl_context *context;
-	int ret, id;
-
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-
-	if (context == NULL) {
-		KGSL_DRV_INFO(dev_priv->device, "kzalloc(%d) failed\n",
-				sizeof(*context));
-		return ERR_PTR(-ENOMEM);
-	}
+	int ret = 0, id;
+	struct kgsl_device *device = dev_priv->device;
 
 	while (1) {
-		if (idr_pre_get(&dev_priv->device->context_idr,
-				GFP_KERNEL) == 0) {
-			KGSL_DRV_INFO(dev_priv->device,
-					"idr_pre_get: ENOMEM\n");
+		if (idr_pre_get(&device->context_idr, GFP_KERNEL) == 0) {
+			KGSL_DRV_INFO(device, "idr_pre_get: ENOMEM\n");
 			ret = -ENOMEM;
-			goto func_end;
+			break;
 		}
 
-		ret = idr_get_new_above(&dev_priv->device->context_idr,
-				  context, 1, &id);
+		write_lock(&device->context_lock);
+		ret = idr_get_new_above(&device->context_idr, context, 1, &id);
+		context->id = id;
+		write_unlock(&device->context_lock);
 
 		if (ret != -EAGAIN)
 			break;
 	}
 
 	if (ret)
-		goto func_end;
+		goto fail;
 
 	/* MAX - 1, there is one memdesc in memstore for device info */
 	if (id >= KGSL_MEMSTORE_MAX) {
-		KGSL_DRV_ERR(dev_priv->device, "cannot have more than %d "
+		KGSL_DRV_INFO(device, "cannot have more than %d "
 				"ctxts due to memstore limitation\n",
 				KGSL_MEMSTORE_MAX);
-		idr_remove(&dev_priv->device->context_idr, id);
 		ret = -ENOSPC;
-		goto func_end;
+		goto fail_free_id;
 	}
 
 	kref_init(&context->refcount);
-	context->id = id;
-	context->dev_priv = dev_priv;
+	context->device = dev_priv->device;
+	context->pagetable = dev_priv->process_priv->pagetable;
+
+	context->pid = dev_priv->process_priv->pid;
 
 	ret = kgsl_sync_timeline_create(context);
 	if (ret) {
-		idr_remove(&dev_priv->device->context_idr, id);
-		goto func_end;
+		goto fail_free_id;
 	}
 
 	/* Initialize the pending event list */
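Per the kernel-doc above, a device-specific drawctxt_create is expected to allocate its own context struct, embed the kgsl_context, and hand it to this helper; only after kgsl_context_init() succeeds does kgsl_context_put() become the correct error path. A sketch of that shape, assuming the adreno_context layout with an embedded base member (as used by the tracepoints in adreno_trace.h):

	/* Sketch: expected shape of a device-specific create path */
	struct adreno_context *drawctxt = kzalloc(sizeof(*drawctxt), GFP_KERNEL);
	int ret;

	if (drawctxt == NULL)
		return ERR_PTR(-ENOMEM);

	ret = kgsl_context_init(dev_priv, &drawctxt->base);
	if (ret) {
		kfree(drawctxt);	/* refcounting never went live */
		return ERR_PTR(ret);
	}
	/* ... device-specific setup; on failure here use kgsl_context_put() ... */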
@@ -358,50 +414,57 @@ kgsl_create_context(struct kgsl_device_private *dev_priv)
 	 */
 
 	INIT_LIST_HEAD(&context->events_list);
-
-func_end:
-	if (ret) {
-		kfree(context);
-		return ERR_PTR(ret);
-	}
-
-	return context;
+	return 0;
+fail_free_id:
+	write_lock(&device->context_lock);
+	idr_remove(&dev_priv->device->context_idr, id);
+	write_unlock(&device->context_lock);
+fail:
+	return ret;
 }
+EXPORT_SYMBOL(kgsl_context_init);
 
 /**
- * kgsl_context_detach - Release the "master" context reference
- * @context - The context that will be detached
+ * kgsl_context_detach() - Release the "master" context reference
+ * @context: The context that will be detached
  *
  * This is called when a context becomes unusable, because userspace
  * has requested for it to be destroyed. The context itself may
  * exist a bit longer until its reference count goes to zero.
  * Other code referencing the context can detect that it has been
- * detached because the context id will be set to KGSL_CONTEXT_INVALID.
+ * detached by checking the KGSL_CONTEXT_DETACHED bit in
+ * context->priv.
  */
-void
-kgsl_context_detach(struct kgsl_context *context)
+int kgsl_context_detach(struct kgsl_context *context)
 {
-	int id;
 	struct kgsl_device *device;
-	if (context == NULL)
-		return;
-	device = context->dev_priv->device;
+	int ret;
+
+	if (context == NULL || kgsl_context_detached(context))
+		return -EINVAL;
+
+	device = context->device;
+
 	trace_kgsl_context_detach(device, context);
-	id = context->id;
 
-	if (device->ftbl->drawctxt_destroy)
-		device->ftbl->drawctxt_destroy(device, context);
-	/*device specific drawctxt_destroy MUST clean up devctxt */
-	BUG_ON(context->devctxt);
+	/*
+	 * Mark the context as detached to keep others from using
+	 * the context before it gets fully removed
+	 */
+	set_bit(KGSL_CONTEXT_DETACHED, &context->priv);
+
+	ret = device->ftbl->drawctxt_detach(context);
+
 	/*
 	 * Cancel events after the device-specific context is
 	 * destroyed, to avoid possibly freeing memory while
 	 * it is still in use by the GPU.
 	 */
+
 	kgsl_context_cancel_events(device, context);
-	idr_remove(&device->context_idr, id);
-	context->id = KGSL_CONTEXT_INVALID;
 	kgsl_context_put(context);
+
+	return ret;
 }
 
 void
@@ -409,29 +472,21 @@ kgsl_context_destroy(struct kref *kref)
 {
 	struct kgsl_context *context = container_of(kref, struct kgsl_context,
 						    refcount);
-	kgsl_sync_timeline_destroy(context);
-	kfree(context);
-}
+	struct kgsl_device *device = context->device;
 
-static void kgsl_check_idle_locked(struct kgsl_device *device)
-{
-	if (device->pwrctrl.nap_allowed == true &&
-	    device->state == KGSL_STATE_ACTIVE &&
-		device->requested_state == KGSL_STATE_NONE) {
-		kgsl_pwrctrl_request_state(device, KGSL_STATE_NAP);
-		kgsl_pwrscale_idle(device);
-		if (kgsl_pwrctrl_sleep(device) != 0)
-			mod_timer(&device->idle_timer,
-				  jiffies +
-				  device->pwrctrl.interval_timeout);
+	trace_kgsl_context_destroy(device, context);
+
+	BUG_ON(!kgsl_context_detached(context));
+
+	write_lock(&device->context_lock);
+	if (context->id != KGSL_CONTEXT_INVALID) {
+		idr_remove(&device->context_idr, context->id);
+		context->id = KGSL_CONTEXT_INVALID;
 	}
-}
+	write_unlock(&device->context_lock);
+	kgsl_sync_timeline_destroy(context);
 
-static void kgsl_check_idle(struct kgsl_device *device)
-{
-	mutex_lock(&device->mutex);
-	kgsl_check_idle_locked(device);
-	mutex_unlock(&device->mutex);
+	device->ftbl->drawctxt_destroy(context);
 }
 
 struct kgsl_device *kgsl_get_device(int dev_idx)
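The detached check referenced in the kernel-doc above reduces to a bit test on context->priv; a sketch consistent with the set_bit() in kgsl_context_detach():

	/* Sketch: the detached test implied by KGSL_CONTEXT_DETACHED */
	static inline bool kgsl_context_detached(struct kgsl_context *context)
	{
		return test_bit(KGSL_CONTEXT_DETACHED, &context->priv);
	}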
@@ -496,23 +551,23 @@ static int kgsl_suspend_device(struct kgsl_device *device, pm_message_t state)
 	policy_saved = device->pwrscale.policy;
 	device->pwrscale.policy = NULL;
 	kgsl_pwrctrl_request_state(device, KGSL_STATE_SUSPEND);
-	/* Make sure no user process is waiting for a timestamp *
-	 * before supending */
-	if (device->active_cnt != 0) {
-		mutex_unlock(&device->mutex);
-		wait_for_completion(&device->suspend_gate);
-		mutex_lock(&device->mutex);
-	}
+
+	/* Tell the device to drain the submission queue */
+	device->ftbl->drain(device);
+
+	/* Wait for the active count to hit zero */
+	kgsl_active_count_wait(device);
+
 	/* Don't let the timer wake us during suspended sleep. */
 	del_timer_sync(&device->idle_timer);
 	switch (device->state) {
 		case KGSL_STATE_INIT:
 			break;
 		case KGSL_STATE_ACTIVE:
-			/* Wait for the device to become idle */
-			device->ftbl->idle(device);
 		case KGSL_STATE_NAP:
 		case KGSL_STATE_SLEEP:
+			/* make sure power is on to stop the device */
+			kgsl_pwrctrl_enable(device);
 			/* Get the completion ready to be waited upon. */
 			INIT_COMPLETION(device->hwaccess_gate);
 			device->ftbl->suspend_context(device);
@@ -600,9 +655,14 @@ void kgsl_early_suspend_driver(struct early_suspend *h)
 					struct kgsl_device, display_off);
 	KGSL_PWR_WARN(device, "early suspend start\n");
 	mutex_lock(&device->mutex);
-	device->pwrctrl.restore_slumber = true;
-	kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER);
-	kgsl_pwrctrl_sleep(device);
+
+	/* Only go to slumber if active_cnt is 0 */
+	if (atomic_read(&device->active_cnt) == 0) {
+		device->pwrctrl.restore_slumber = true;
+		kgsl_pwrctrl_request_state(device, KGSL_STATE_SLUMBER);
+		kgsl_pwrctrl_sleep(device);
+	}
+
 	mutex_unlock(&device->mutex);
 	KGSL_PWR_WARN(device, "early suspend end\n");
 }
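This hunk and several below replace the bare active_cnt integer with atomic get/put helpers. The usage convention visible throughout the patch - get powers the device up if needed (via kgsl_pwrctrl_wake()), put allows it to idle again - looks like this in isolation:

	/* Sketch: usage convention for the active count helpers */
	mutex_lock(&device->mutex);
	kgsl_active_count_get(device);	/* wakes the device if needed */

	/* ... touch hardware or register events ... */

	kgsl_active_count_put(device);	/* device may idle once this hits 0 */
	mutex_unlock(&device->mutex);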
@@ -632,24 +692,121 @@ void kgsl_late_resume_driver(struct early_suspend *h)
 	device->pwrctrl.restore_slumber = false;
 	if (device->pwrscale.policy == NULL)
 		kgsl_pwrctrl_pwrlevel_change(device, KGSL_PWRLEVEL_TURBO);
-	kgsl_pwrctrl_wake(device);
+	if (kgsl_pwrctrl_wake(device) != 0)
+		return;
+	/*
+	 * We don't have a way to go directly from
+	 * a deeper sleep state to NAP, which is
+	 * the desired state here.
+	 *
+	 * Except if active_cnt is non zero which means that
+	 * we probably went to early_suspend with it non zero
+	 * and thus the system is still in an active state.
+	 */
+
+	if (atomic_read(&device->active_cnt) == 0) {
+		kgsl_pwrctrl_request_state(device, KGSL_STATE_NAP);
+		kgsl_pwrctrl_sleep(device);
+	}
+
 	mutex_unlock(&device->mutex);
-	kgsl_check_idle(device);
 	KGSL_PWR_WARN(device, "late resume end\n");
 }
 EXPORT_SYMBOL(kgsl_late_resume_driver);
 
-/* file operations */
+/*
+ * kgsl_destroy_process_private() - Cleanup function to free process private
+ * @kref: Pointer to the kref struct of the object being destroyed
+ *
+ * Free the struct object and all other resources attached to it.
+ * Since the function can be used when not all resources inside process
+ * private have been allocated, each resource cleanup first checks
+ * whether the struct member being cleaned was in fact allocated.
+ * If the value is not NULL, the resource is freed.
+ */
+static void kgsl_destroy_process_private(struct kref *kref)
+{
+
+	struct kgsl_mem_entry *entry = NULL;
+	int next = 0;
+
+
+	struct kgsl_process_private *private = container_of(kref,
+			struct kgsl_process_private, refcount);
+
+	/*
+	 * Remove this process from global process list
+	 * We do not acquire a lock first as it is expected that
+	 * kgsl_destroy_process_private() is only going to be called
+	 * through kref_put() which is only called after acquiring
+	 * the lock.
+	 */
+	if (!private) {
+		KGSL_CORE_ERR("Cannot destroy null process private\n");
+		mutex_unlock(&kgsl_driver.process_mutex);
+		return;
+	}
+	list_del(&private->list);
+	mutex_unlock(&kgsl_driver.process_mutex);
+
+	if (private->kobj.parent)
+		kgsl_process_uninit_sysfs(private);
+	if (private->debug_root)
+		debugfs_remove_recursive(private->debug_root);
+
+	while (1) {
+		rcu_read_lock();
+		entry = idr_get_next(&private->mem_idr, &next);
+		rcu_read_unlock();
+		if (entry == NULL)
+			break;
+		kgsl_mem_entry_detach_process(entry);
+		/*
+		 * Always start back at the beginning, to
+		 * ensure all entries are removed,
+		 * like list_for_each_entry_safe.
+		 */
+		next = 0;
+	}
+	kgsl_mmu_putpagetable(private->pagetable);
+	idr_destroy(&private->mem_idr);
+
+	kfree(private);
+	return;
+}
+
+static void
+kgsl_put_process_private(struct kgsl_device *device,
+			 struct kgsl_process_private *private)
+{
+	mutex_lock(&kgsl_driver.process_mutex);
+
+	/*
+	 * kref_put() returns 1 when the refcnt has reached 0 and the destroy
+	 * function is called. Mutex is released in the destroy function if
+	 * its called, so only release mutex if kref_put() return 0
+	 */
+	if (!kref_put(&private->refcount, kgsl_destroy_process_private))
+		mutex_unlock(&kgsl_driver.process_mutex);
+	return;
+}
+
+/*
+ * kgsl_find_process_private() - Helper function to search for process private
+ * @cur_dev_priv: Pointer to device private structure which contains pointers
+ * to device and process_private structs.
+ * Returns: Pointer to the found/newly created private struct
+ */
 static struct kgsl_process_private *
-kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv)
+kgsl_find_process_private(struct kgsl_device_private *cur_dev_priv)
 {
 	struct kgsl_process_private *private;
 
+	/* Search in the process list */
 	mutex_lock(&kgsl_driver.process_mutex);
 	list_for_each_entry(private, &kgsl_driver.process_list, list) {
 		if (private->pid == task_tgid_nr(current)) {
-			private->refcnt++;
-			goto out;
+			kref_get(&private->refcount);
+			goto done;
 		}
 	}
 
@@ -658,80 +815,65 @@ kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv)
 	if (private == NULL) {
 		KGSL_DRV_ERR(cur_dev_priv->device, "kzalloc(%d) failed\n",
 			sizeof(struct kgsl_process_private));
-		goto out;
+		goto done;
 	}
 
-	spin_lock_init(&private->mem_lock);
-	private->refcnt = 1;
+	kref_init(&private->refcount);
+
 	private->pid = task_tgid_nr(current);
-	private->mem_rb = RB_ROOT;
+	spin_lock_init(&private->mem_lock);
+	mutex_init(&private->process_private_mutex);
+	/* Add the newly created process struct obj to the process list */
+	list_add(&private->list, &kgsl_driver.process_list);
+done:
+	mutex_unlock(&kgsl_driver.process_mutex);
+	return private;
+}
 
-	idr_init(&private->mem_idr);
+/*
+ * kgsl_get_process_private() - Used to find the process private structure
+ * @cur_dev_priv: Current device pointer
+ * Finds or creates a new process private structure and initializes its members
+ * Returns: Pointer to the process private struct that was found/created, or
+ * NULL if pagetable creation for this process private struct failed.
+ */
+static struct kgsl_process_private *
+kgsl_get_process_private(struct kgsl_device_private *cur_dev_priv)
+{
+	struct kgsl_process_private *private;
 
-	if (kgsl_mmu_enabled())
-	{
+	private = kgsl_find_process_private(cur_dev_priv);
+
+	mutex_lock(&private->process_private_mutex);
+
+	if (!private->mem_rb.rb_node) {
+		private->mem_rb = RB_ROOT;
+		idr_init(&private->mem_idr);
+	}
+
+	if ((!private->pagetable) && kgsl_mmu_enabled()) {
 		unsigned long pt_name;
 
 		pt_name = task_tgid_nr(current);
 		private->pagetable = kgsl_mmu_getpagetable(pt_name);
 		if (private->pagetable == NULL) {
-			kfree(private);
-			private = NULL;
-			goto out;
+			mutex_unlock(&private->process_private_mutex);
+			kgsl_put_process_private(cur_dev_priv->device,
+						private);
+			return NULL;
 		}
 	}
 
-	list_add(&private->list, &kgsl_driver.process_list);
+	if (!private->kobj.parent)
+		kgsl_process_init_sysfs(private);
+	if (!private->debug_root)
+		kgsl_process_init_debugfs(private);
 
-	kgsl_process_init_sysfs(private);
-	kgsl_process_init_debugfs(private);
+	mutex_unlock(&private->process_private_mutex);
 
-out:
-	mutex_unlock(&kgsl_driver.process_mutex);
 	return private;
 }
 
-static void
-kgsl_put_process_private(struct kgsl_device *device,
-			 struct kgsl_process_private *private)
-{
-	struct kgsl_mem_entry *entry = NULL;
-	int next = 0;
-
-	if (!private)
-		return;
-
-	mutex_lock(&kgsl_driver.process_mutex);
-
-	if (--private->refcnt)
-		goto unlock;
-
-	kgsl_process_uninit_sysfs(private);
-	debugfs_remove_recursive(private->debug_root);
-
-	list_del(&private->list);
-
-	while (1) {
-		rcu_read_lock();
-		entry = idr_get_next(&private->mem_idr, &next);
-		rcu_read_unlock();
-		if (entry == NULL)
-			break;
-		kgsl_mem_entry_detach_process(entry);
-		/*
-		 * Always start back at the beginning, to
-		 * ensure all entries are removed,
-		 * like list_for_each_entry_safe.
-		 */
-		next = 0;
-	}
-	kgsl_mmu_putpagetable(private->pagetable);
-	idr_destroy(&private->mem_idr);
-	kfree(private);
-unlock:
-	mutex_unlock(&kgsl_driver.process_mutex);
-}
-
 static int kgsl_release(struct inode *inodep, struct file *filep)
 {
 	int result = 0;
@@ -744,15 +886,27 @@ static int kgsl_release(struct inode *inodep, struct file *filep)
 	filep->private_data = NULL;
 
 	mutex_lock(&device->mutex);
-	kgsl_check_suspended(device);
+	kgsl_active_count_get(device);
 
 	while (1) {
+		read_lock(&device->context_lock);
 		context = idr_get_next(&device->context_idr, &next);
+		read_unlock(&device->context_lock);
+
 		if (context == NULL)
 			break;
 
-		if (context->dev_priv == dev_priv)
+
+		if (context->pid == private->pid) {
+			/*
+			 * Hold a reference to the context in case somebody
+			 * tries to put it while we are detaching
+			 */
+
+			_kgsl_context_get(context);
 			kgsl_context_detach(context);
+			kgsl_context_put(context);
+		}
 
 		next = next + 1;
 	}
@@ -766,10 +920,17 @@ static int kgsl_release(struct inode *inodep, struct file *filep)
 
 	device->open_count--;
 	if (device->open_count == 0) {
+		BUG_ON(atomic_read(&device->active_cnt) > 1);
 		result = device->ftbl->stop(device);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
+		/*
+		 * active_cnt special case: we just stopped the device,
+		 * so no need to use kgsl_active_count_put()
+		 */
+		atomic_dec(&device->active_cnt);
+	} else {
+		kgsl_active_count_put(device);
 	}
-
 	mutex_unlock(&device->mutex);
 	kfree(dev_priv);
 
@@ -815,19 +976,27 @@ static int kgsl_open(struct inode *inodep, struct file *filep)
 	filep->private_data = dev_priv;
 
 	mutex_lock(&device->mutex);
-	kgsl_check_suspended(device);
 
 	if (device->open_count == 0) {
+		/*
+		 * active_cnt special case: we are starting up for the first
+		 * time, so use this sequence instead of the kgsl_pwrctrl_wake()
+		 * which will be called by kgsl_active_count_get().
+		 */
+		atomic_inc(&device->active_cnt);
 		kgsl_sharedmem_set(&device->memstore, 0, 0,
 				device->memstore.size);
 
-		result = device->ftbl->start(device, true);
+		result = device->ftbl->init(device);
+		if (result)
+			goto err_freedevpriv;
 
-		if (result) {
-			mutex_unlock(&device->mutex);
+		result = device->ftbl->start(device);
+		if (result)
 			goto err_freedevpriv;
-		}
+
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
+		kgsl_active_count_put(device);
 	}
 	device->open_count++;
 	mutex_unlock(&device->mutex);
@@ -853,11 +1022,17 @@ err_stop:
 	mutex_lock(&device->mutex);
 	device->open_count--;
 	if (device->open_count == 0) {
+		/* make sure power is on to stop the device */
+		kgsl_pwrctrl_enable(device);
 		result = device->ftbl->stop(device);
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_INIT);
 	}
-	mutex_unlock(&device->mutex);
 err_freedevpriv:
+	/* only the first open takes an active count */
+	if (device->open_count == 0)
+		atomic_dec(&device->active_cnt);
+
+	mutex_unlock(&device->mutex);
 	filep->private_data = NULL;
 	kfree(dev_priv);
 err_pmruntime:
@@ -865,8 +1040,17 @@ err_pmruntime:
 	return result;
 }
 
-/*call with private->mem_lock locked */
-struct kgsl_mem_entry *
+/**
+ * kgsl_sharedmem_find_region() - Find a gpu memory allocation
+ *
+ * @private: private data for the process to check.
+ * @gpuaddr: start address of the region
+ * @size: size of the region
+ *
+ * Find a gpu allocation. Caller must kgsl_mem_entry_put()
+ * the returned entry when finished using it.
+ */
+struct kgsl_mem_entry * __must_check
 kgsl_sharedmem_find_region(struct kgsl_process_private *private,
 	unsigned int gpuaddr, size_t size)
 {
@@ -875,46 +1059,57 @@ kgsl_sharedmem_find_region(struct kgsl_process_private *private,
 	if (!kgsl_mmu_gpuaddr_in_range(gpuaddr))
 		return NULL;
 
+	spin_lock(&private->mem_lock);
 	while (node != NULL) {
 		struct kgsl_mem_entry *entry;
 
 		entry = rb_entry(node, struct kgsl_mem_entry, node);
 
-
-		if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size))
+		if (kgsl_gpuaddr_in_memdesc(&entry->memdesc, gpuaddr, size)) {
+			kgsl_mem_entry_get(entry);
+			spin_unlock(&private->mem_lock);
 			return entry;
-
+		}
 		if (gpuaddr < entry->memdesc.gpuaddr)
 			node = node->rb_left;
 		else if (gpuaddr >=
 			(entry->memdesc.gpuaddr + entry->memdesc.size))
 			node = node->rb_right;
 		else {
+			spin_unlock(&private->mem_lock);
 			return NULL;
 		}
 	}
+	spin_unlock(&private->mem_lock);
 
 	return NULL;
 }
 EXPORT_SYMBOL(kgsl_sharedmem_find_region);
 
-/*call with private->mem_lock locked */
-static inline struct kgsl_mem_entry *
+/**
+ * kgsl_sharedmem_find() - Find a gpu memory allocation
+ *
+ * @private: private data for the process to check.
+ * @gpuaddr: start address of the region
+ *
+ * Find a gpu allocation. Caller must kgsl_mem_entry_put()
+ * the returned entry when finished using it.
+ */
+static inline struct kgsl_mem_entry * __must_check
 kgsl_sharedmem_find(struct kgsl_process_private *private, unsigned int gpuaddr)
 {
 	return kgsl_sharedmem_find_region(private, gpuaddr, 1);
 }
 
 /**
- * kgsl_sharedmem_region_empty - Check if an addression region is empty
+ * kgsl_sharedmem_region_empty() - Check if an address region is empty
  *
  * @private: private data for the process to check.
  * @gpuaddr: start address of the region
  * @size: length of the region.
  *
  * Checks that there are no existing allocations within an address
- * region. Note that unlike other kgsl_sharedmem* search functions,
- * this one manages locking on its own.
+ * region.
  */
 int
 kgsl_sharedmem_region_empty(struct kgsl_process_private *private,
@@ -958,19 +1153,24 @@ kgsl_sharedmem_region_empty(struct kgsl_process_private *private,
 }
 
 /**
- * kgsl_sharedmem_find_id - find a memory entry by id
+ * kgsl_sharedmem_find_id() - find a memory entry by id
  * @process: the owning process
  * @id: id to find
  *
  * @returns - the mem_entry or NULL
+ *
+ * Caller must kgsl_mem_entry_put() the returned entry, when finished using
+ * it.
  */
-static inline struct kgsl_mem_entry *
+static inline struct kgsl_mem_entry * __must_check
 kgsl_sharedmem_find_id(struct kgsl_process_private *process, unsigned int id)
 {
 	struct kgsl_mem_entry *entry;
 
 	rcu_read_lock();
 	entry = idr_find(&process->mem_idr, id);
+	if (entry)
+		kgsl_mem_entry_get(entry);
 	rcu_read_unlock();
 
 	return entry;
@@ -1035,177 +1235,696 @@ static long kgsl_ioctl_device_getproperty(struct kgsl_device_private *dev_priv,
 			context->reset_status = KGSL_CTX_STAT_NO_ERROR;
 		}
 
-		kgsl_context_put(context);
-		break;
-	}
-	default:
-		result = dev_priv->device->ftbl->getproperty(
-					dev_priv->device, param->type,
-					param->value, param->sizebytes);
+		kgsl_context_put(context);
+		break;
+	}
+	default:
+		result = dev_priv->device->ftbl->getproperty(
+					dev_priv->device, param->type,
+					param->value, param->sizebytes);
+	}
+
+
+	return result;
+}
+
+static long kgsl_ioctl_device_setproperty(struct kgsl_device_private *dev_priv,
+					  unsigned int cmd, void *data)
+{
+	int result = 0;
+	/* The getproperty struct is reused for setproperty too */
+	struct kgsl_device_getproperty *param = data;
+
+	if (dev_priv->device->ftbl->setproperty)
+		result = dev_priv->device->ftbl->setproperty(
+			dev_priv->device, param->type,
+			param->value, param->sizebytes);
+
+	return result;
+}
+
+static long _device_waittimestamp(struct kgsl_device_private *dev_priv,
+		struct kgsl_context *context,
+		unsigned int timestamp,
+		unsigned int timeout)
+{
+	int result = 0;
+	struct kgsl_device *device = dev_priv->device;
+	unsigned int context_id = context ? context->id : KGSL_MEMSTORE_GLOBAL;
+
+	trace_kgsl_waittimestamp_entry(device, context_id,
+				       kgsl_readtimestamp(device, context,
+							KGSL_TIMESTAMP_RETIRED),
+				       timestamp, timeout);
+
+	result = device->ftbl->waittimestamp(dev_priv->device,
+					context, timestamp, timeout);
+
+	trace_kgsl_waittimestamp_exit(device,
+				      kgsl_readtimestamp(device, context,
+							KGSL_TIMESTAMP_RETIRED),
+				      result);
+
+	return result;
+}
+
+static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private
+						*dev_priv, unsigned int cmd,
+						void *data)
+{
+	struct kgsl_device_waittimestamp *param = data;
+
+	return _device_waittimestamp(dev_priv, NULL,
+			param->timestamp, param->timeout);
+}
+
+static long kgsl_ioctl_device_waittimestamp_ctxtid(struct kgsl_device_private
+						*dev_priv, unsigned int cmd,
+						void *data)
+{
+	struct kgsl_device_waittimestamp_ctxtid *param = data;
+	struct kgsl_context *context;
+	long result = -EINVAL;
+
+	context = kgsl_context_get_owner(dev_priv, param->context_id);
+
+	if (context)
+		result = _device_waittimestamp(dev_priv, context,
+			param->timestamp, param->timeout);
+
+	kgsl_context_put(context);
+	return result;
+}
+
+/*
+ * KGSL command batch management
+ * A command batch is a single submission from userland.  The cmdbatch
+ * encapsulates everything about the submission: command buffers, flags and
+ * sync points.
+ *
+ * Sync points are events that need to expire before the
+ * cmdbatch can be queued to the hardware. For each sync point a
+ * kgsl_cmdbatch_sync_event struct is created and added to a list in the
+ * cmdbatch. There can be multiple types of events both internal ones (GPU
+ * events) and external triggers. As the events expire the struct is deleted
+ * from the list. The GPU will submit the command batch as soon as the list
+ * goes empty, indicating that all the sync points have been met.
+ */
+
+/**
+ * struct kgsl_cmdbatch_sync_event
+ * @type: Syncpoint type
+ * @node: Local list node for the cmdbatch sync point list
+ * @cmdbatch: Pointer to the cmdbatch that owns the sync event
+ * @context: Pointer to the KGSL context that owns the cmdbatch
+ * @timestamp: Pending timestamp for the event
+ * @handle: Pointer to a sync fence handle
+ * @device: Pointer to the KGSL device
+ * @lock: Spinlock to protect the handle against the fence callback race
+ */
+struct kgsl_cmdbatch_sync_event {
+	int type;
+	struct list_head node;
+	struct kgsl_cmdbatch *cmdbatch;
+	struct kgsl_context *context;
+	unsigned int timestamp;
+	struct kgsl_sync_fence_waiter *handle;
+	struct kgsl_device *device;
+	spinlock_t lock;
+};
+
+/**
+ * kgsl_cmdbatch_destroy_object() - Destroy a cmdbatch object
+ * @kref: Pointer to the kref structure for this object
+ *
+ * Actually destroy a command batch object.  Called from kgsl_cmdbatch_put
+ */
+void kgsl_cmdbatch_destroy_object(struct kref *kref)
+{
+	struct kgsl_cmdbatch *cmdbatch = container_of(kref,
+		struct kgsl_cmdbatch, refcount);
+
+	kgsl_context_put(cmdbatch->context);
+	kfree(cmdbatch->ibdesc);
+
+	kfree(cmdbatch);
+}
+EXPORT_SYMBOL(kgsl_cmdbatch_destroy_object);
+
+static void kgsl_cmdbatch_sync_expire(struct kgsl_device *device,
+	struct kgsl_cmdbatch_sync_event *event)
+{
+	int sched = 0;
+
+	spin_lock(&event->cmdbatch->lock);
+	list_del(&event->node);
+	sched = list_empty(&event->cmdbatch->synclist) ? 1 : 0;
+	spin_unlock(&event->cmdbatch->lock);
+
+	/*
+	 * if this is the last event in the list then tell
+	 * the GPU device that the cmdbatch can be submitted
+	 */
+
+	if (sched && device->ftbl->drawctxt_sched)
+		device->ftbl->drawctxt_sched(device, event->cmdbatch->context);
+}
+
+
+/*
+ * This function is called by the GPU event when the sync event timestamp
+ * expires
+ */
+static void kgsl_cmdbatch_sync_func(struct kgsl_device *device, void *priv,
+		u32 id, u32 timestamp, u32 type)
+{
+	struct kgsl_cmdbatch_sync_event *event = priv;
+
+	kgsl_cmdbatch_sync_expire(device, event);
+
+	kgsl_context_put(event->context);
+	kgsl_cmdbatch_put(event->cmdbatch);
+
+	kfree(event);
+}
+
+/**
+ * kgsl_cmdbatch_destroy() - Destroy a cmdbatch structure
+ * @cmdbatch: Pointer to the command batch object to destroy
+ *
+ * Start the process of destroying a command batch.  Cancel any pending events
+ * and decrement the refcount.
+ */
+void kgsl_cmdbatch_destroy(struct kgsl_cmdbatch *cmdbatch)
+{
+	struct kgsl_cmdbatch_sync_event *event, *tmp;
+	int canceled = 0;
+
+	spin_lock(&cmdbatch->lock);
+
+	/* Delete any pending sync points for this command batch */
+	list_for_each_entry_safe(event, tmp, &cmdbatch->synclist, node) {
+
+		switch (event->type) {
+		case KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP: {
+			/* Cancel the event if it still exists */
+			mutex_lock(&cmdbatch->device->mutex);
+			kgsl_cancel_event(cmdbatch->device, event->context,
+				event->timestamp, kgsl_cmdbatch_sync_func,
+				event);
+			canceled = 1;
+			mutex_unlock(&cmdbatch->device->mutex);
+			kgsl_context_put(event->context);
+			break;
+		}
+		case KGSL_CMD_SYNCPOINT_TYPE_FENCE:
+			canceled = kgsl_sync_fence_async_cancel(event->handle);
+			break;
+		default:
+			break;
+		}
+
+		if (canceled) {
+			list_del(&event->node);
+			kfree(event);
+
+			/*
+			 * Put back an instance of the cmdbatch for each
+			 * pending event that we canceled
+			 */
+
+			kgsl_cmdbatch_put(cmdbatch);
+		}
+	}
+	spin_unlock(&cmdbatch->lock);
+
+	kgsl_cmdbatch_put(cmdbatch);
+}
+EXPORT_SYMBOL(kgsl_cmdbatch_destroy);
+
+static void kgsl_cmdbatch_sync_fence_func(void *priv)
+{
+	struct kgsl_cmdbatch_sync_event *event = priv;
+
+	spin_lock(&event->lock);
+	kgsl_cmdbatch_sync_expire(event->device, event);
+	kgsl_cmdbatch_put(event->cmdbatch);
+	spin_unlock(&event->lock);
+	kfree(event);
+}
+
+/**
+ * kgsl_cmdbatch_add_sync_fence() - Add a new sync fence syncpoint
+ * @device: KGSL device
+ * @cmdbatch: KGSL cmdbatch to add the sync point to
+ * @priv: Private structure passed by the user
+ *
+ * Add a new fence sync syncpoint to the cmdbatch.
+ */
+static int kgsl_cmdbatch_add_sync_fence(struct kgsl_device *device,
+		struct kgsl_cmdbatch *cmdbatch, void *priv)
+{
+	struct kgsl_cmd_syncpoint_fence *sync = priv;
+	struct kgsl_cmdbatch_sync_event *event;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+
+	if (event == NULL)
+		return -ENOMEM;
+
+	kref_get(&cmdbatch->refcount);
+
+	event->type = KGSL_CMD_SYNCPOINT_TYPE_FENCE;
+	event->cmdbatch = cmdbatch;
+	event->device = device;
+	spin_lock_init(&event->lock);
+
+	/*
+	 * Add it to the list first to account for the possibility that the
+	 * callback will happen immediately after the call to
+	 * kgsl_sync_fence_async_wait
+	 */
+
+	spin_lock(&cmdbatch->lock);
+	list_add(&event->node, &cmdbatch->synclist);
+	spin_unlock(&cmdbatch->lock);
+
+	/*
+	 * There is a distinct race condition that can occur if the fence
+	 * callback is fired before the function has a chance to return.  The
+	 * event struct would be freed before we could write event->handle and
+	 * hilarity would ensue.  Guard against this by protecting the call to
+	 * kgsl_sync_fence_async_wait and the kfree in the callback with a lock.
+	 */
+
+	spin_lock(&event->lock);
+
+	event->handle = kgsl_sync_fence_async_wait(sync->fd,
+		kgsl_cmdbatch_sync_fence_func, event);
+
+
+	if (IS_ERR_OR_NULL(event->handle)) {
+		int ret = PTR_ERR(event->handle);
+
+		spin_lock(&cmdbatch->lock);
+		list_del(&event->node);
+		spin_unlock(&cmdbatch->lock);
+
+		kgsl_cmdbatch_put(cmdbatch);
+		spin_unlock(&event->lock);
+		kfree(event);
+
+		return ret;
+	}
+
+	spin_unlock(&event->lock);
+	return 0;
+}
+
+/**
+ * kgsl_cmdbatch_add_sync_timestamp() - Add a new sync point for a cmdbatch
+ * @device: KGSL device
+ * @cmdbatch: KGSL cmdbatch to add the sync point to
+ * @priv: Private structure passed by the user
+ *
+ * Add a new sync point timestamp event to the cmdbatch.
+ */
+static int kgsl_cmdbatch_add_sync_timestamp(struct kgsl_device *device,
+		struct kgsl_cmdbatch *cmdbatch, void *priv)
+{
+	struct kgsl_cmd_syncpoint_timestamp *sync = priv;
+	struct kgsl_context *context = kgsl_context_get(cmdbatch->device,
+		sync->context_id);
+	struct kgsl_cmdbatch_sync_event *event;
+	int ret = -EINVAL;
+
+	if (context == NULL)
+		return -EINVAL;
+
+	/* Sanity check - you can't create a sync point on your own context */
+	if (context == cmdbatch->context) {
+		KGSL_DRV_ERR(device,
+			"Cannot create a sync point on your own context\n");
+		goto done;
+	}
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (event == NULL) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	kref_get(&cmdbatch->refcount);
+
+	event->type = KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP;
+	event->cmdbatch = cmdbatch;
+	event->context = context;
+	event->timestamp = sync->timestamp;
+
+	spin_lock(&cmdbatch->lock);
+	list_add(&event->node, &cmdbatch->synclist);
+	spin_unlock(&cmdbatch->lock);
+
+	mutex_lock(&device->mutex);
+	kgsl_active_count_get(device);
+	ret = kgsl_add_event(device, context->id, sync->timestamp,
+		kgsl_cmdbatch_sync_func, event, NULL);
+	kgsl_active_count_put(device);
+	mutex_unlock(&device->mutex);
+
+	if (ret) {
+		spin_lock(&cmdbatch->lock);
+		list_del(&event->node);
+		spin_unlock(&cmdbatch->lock);
+
+		kgsl_cmdbatch_put(cmdbatch);
+		kfree(event);
+	}
+
+done:
+	if (ret)
+		kgsl_context_put(context);
+
+	return ret;
+}
+
+/**
+ * kgsl_cmdbatch_add_sync() - Add a sync point to a command batch
+ * @device: Pointer to the KGSL device struct for the GPU
+ * @cmdbatch: Pointer to the cmdbatch
+ * @sync: Pointer to the user-specified struct defining the syncpoint
+ *
+ * Create a new sync point in the cmdbatch based on the user specified
+ * parameters
+ */
+static int kgsl_cmdbatch_add_sync(struct kgsl_device *device,
+	struct kgsl_cmdbatch *cmdbatch,
+	struct kgsl_cmd_syncpoint *sync)
+{
+	void *priv;
+	int ret, psize;
+	int (*func)(struct kgsl_device *device, struct kgsl_cmdbatch *cmdbatch,
+			void *priv);
+
+	switch (sync->type) {
+	case KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP:
+		psize = sizeof(struct kgsl_cmd_syncpoint_timestamp);
+		func = kgsl_cmdbatch_add_sync_timestamp;
+		break;
+	case KGSL_CMD_SYNCPOINT_TYPE_FENCE:
+		psize = sizeof(struct kgsl_cmd_syncpoint_fence);
+		func = kgsl_cmdbatch_add_sync_fence;
+		break;
+	default:
+		KGSL_DRV_ERR(device, "Invalid sync type 0x%x\n", sync->type);
+		return -EINVAL;
+	}
+
+	if (sync->size != psize) {
+		KGSL_DRV_ERR(device, "Invalid sync size %d\n", sync->size);
+		return -EINVAL;
+	}
+
+	priv = kzalloc(sync->size, GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(priv, sync->priv, sync->size)) {
+		kfree(priv);
+		return -EFAULT;
+	}
+
+	ret = func(device, cmdbatch, priv);
+	kfree(priv);
+
+	return ret;
+}
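
For reference, this is roughly what userspace has to place behind sync->priv
for the timestamp case. The field names are taken from the handlers above, but
the exact uapi struct layouts and the type constant's value are assumptions.

#include <string.h>

struct kgsl_cmd_syncpoint_timestamp {
	unsigned int context_id;
	unsigned int timestamp;
};

struct kgsl_cmd_syncpoint {
	int type;
	void *priv;
	unsigned int size;	/* must equal the size of the priv struct */
};

#define KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP 0	/* value assumed */

void build_ts_syncpoint(struct kgsl_cmd_syncpoint *sync,
		struct kgsl_cmd_syncpoint_timestamp *ts,
		unsigned int context_id, unsigned int timestamp)
{
	memset(ts, 0, sizeof(*ts));
	ts->context_id = context_id;	/* must not be the submitting context */
	ts->timestamp = timestamp;

	memset(sync, 0, sizeof(*sync));
	sync->type = KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP;
	sync->priv = ts;
	sync->size = sizeof(*ts);	/* rejected unless it matches exactly */
}
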
+
+/**
+ * kgsl_cmdbatch_create() - Create a new cmdbatch structure
+ * @device: Pointer to a KGSL device struct
+ * @context: Pointer to a KGSL context struct
+ * @flags: Flags passed in by the user for this submission
+ * @numibs: Number of indirect buffers to make room for in the cmdbatch
+ *
+ * Allocate a new cmdbatch structure and add enough room to store the list of
+ * indirect buffers
+ */
+static struct kgsl_cmdbatch *kgsl_cmdbatch_create(struct kgsl_device *device,
+		struct kgsl_context *context, unsigned int flags,
+		unsigned int numibs)
+{
+	struct kgsl_cmdbatch *cmdbatch = kzalloc(sizeof(*cmdbatch), GFP_KERNEL);
+	if (cmdbatch == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(flags & KGSL_CONTEXT_SYNC)) {
+		cmdbatch->ibdesc = kzalloc(sizeof(*cmdbatch->ibdesc) * numibs,
+			GFP_KERNEL);
+		if (cmdbatch->ibdesc == NULL) {
+			kfree(cmdbatch);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	kref_init(&cmdbatch->refcount);
+	INIT_LIST_HEAD(&cmdbatch->synclist);
+	spin_lock_init(&cmdbatch->lock);
+
+	cmdbatch->device = device;
+	cmdbatch->ibcount = (flags & KGSL_CONTEXT_SYNC) ? 0 : numibs;
+	cmdbatch->context = context;
+	cmdbatch->flags = flags;
+
+	/*
+	 * Increase the reference count on the context so it doesn't disappear
+	 * during the lifetime of this command batch
+	 */
+	_kgsl_context_get(context);
+
+	return cmdbatch;
+}
+
+/**
+ * _kgsl_cmdbatch_verify() - Perform a quick sanity check on a command batch
+ * @device: Pointer to a KGSL device that owns the command batch
+ * @cmdbatch: Pointer to the command batch to verify
+ *
+ * Do a quick sanity test on the list of indirect buffers in a command batch,
+ * verifying that each one has a non-zero size and an in-range GPU address
+ */
+static bool _kgsl_cmdbatch_verify(struct kgsl_device *device,
+	struct kgsl_cmdbatch *cmdbatch)
+{
+	int i;
+
+	for (i = 0; i < cmdbatch->ibcount; i++) {
+		if (cmdbatch->ibdesc[i].sizedwords == 0) {
+			KGSL_DRV_ERR(device,
+				"Invalid IB: size is 0\n");
+			return false;
+		}
+
+		if (!kgsl_mmu_gpuaddr_in_range(cmdbatch->ibdesc[i].gpuaddr)) {
+			KGSL_DRV_ERR(device,
+				"Invalid IB: address 0x%X is out of range\n",
+				cmdbatch->ibdesc[i].gpuaddr);
+			return false;
+		}
 	}
 
-
-	return result;
+	return true;
 }
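
The checks are deliberately cheap: a zero-length IB or an address outside the
GPU virtual range is rejected before anything touches hardware. A userspace
mirror of the same test, with an assumed flat [lo, hi) range standing in for
kgsl_mmu_gpuaddr_in_range():

#include <stdbool.h>

struct ibdesc {
	unsigned int gpuaddr;
	unsigned int sizedwords;
};

bool verify_ibs(const struct ibdesc *ib, int count,
		unsigned int lo, unsigned int hi)
{
	int i;

	for (i = 0; i < count; i++) {
		if (ib[i].sizedwords == 0)
			return false;	/* empty IB */
		if (ib[i].gpuaddr < lo || ib[i].gpuaddr >= hi)
			return false;	/* outside the GPU VA range */
	}
	return true;
}
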
 
-static long kgsl_ioctl_device_setproperty(struct kgsl_device_private *dev_priv,
-					  unsigned int cmd, void *data)
+/**
+ * _kgsl_cmdbatch_create_legacy() - Create a cmdbatch from a legacy ioctl struct
+ * @device: Pointer to the KGSL device struct for the GPU
+ * @context: Pointer to the KGSL context that issued the command batch
+ * @param: Pointer to the kgsl_ringbuffer_issueibcmds struct that the user sent
+ *
+ * Create a command batch from the legacy issueibcmds format.
+ */
+static struct kgsl_cmdbatch *_kgsl_cmdbatch_create_legacy(
+		struct kgsl_device *device,
+		struct kgsl_context *context,
+		struct kgsl_ringbuffer_issueibcmds *param)
 {
-	int result = 0;
-	/* The getproperty struct is reused for setproperty too */
-	struct kgsl_device_getproperty *param = data;
+	struct kgsl_cmdbatch *cmdbatch =
+		kgsl_cmdbatch_create(device, context, param->flags, 1);
 
-	if (dev_priv->device->ftbl->setproperty)
-		result = dev_priv->device->ftbl->setproperty(
-			dev_priv->device, param->type,
-			param->value, param->sizebytes);
+	if (IS_ERR(cmdbatch))
+		return cmdbatch;
 
-	return result;
+	cmdbatch->ibdesc[0].gpuaddr = param->ibdesc_addr;
+	cmdbatch->ibdesc[0].sizedwords = param->numibs;
+	cmdbatch->ibcount = 1;
+	cmdbatch->flags = param->flags;
+
+	return cmdbatch;
 }
 
-static long _device_waittimestamp(struct kgsl_device_private *dev_priv,
+/**
+ * _kgsl_cmdbatch_create() - Create a cmdbatch from an ioctl struct
+ * @device: Pointer to the KGSL device struct for the GPU
+ * @context: Pointer to the KGSL context that issued the command batch
+ * @flags: Flags passed in from the user command
+ * @cmdlist: Pointer to the list of commands from the user
+ * @numcmds: Number of commands in the list
+ * @synclist: Pointer to the list of syncpoints from the user
+ * @numsyncs: Number of syncpoints in the list
+ *
+ * Create a command batch from the standard issueibcmds format sent by the user.
+ */
+static struct kgsl_cmdbatch *_kgsl_cmdbatch_create(struct kgsl_device *device,
 		struct kgsl_context *context,
-		unsigned int timestamp,
-		unsigned int timeout)
+		unsigned int flags,
+		unsigned int cmdlist, unsigned int numcmds,
+		unsigned int synclist, unsigned int numsyncs)
 {
-	int result = 0;
-	struct kgsl_device *device = dev_priv->device;
-	unsigned int context_id = context ? context->id : KGSL_MEMSTORE_GLOBAL;
+	struct kgsl_cmdbatch *cmdbatch =
+		kgsl_cmdbatch_create(device, context, flags, numcmds);
+	int ret = 0;
 
-	/* Set the active count so that suspend doesn't do the wrong thing */
+	if (IS_ERR(cmdbatch))
+		return cmdbatch;
 
-	device->active_cnt++;
+	if (!(flags & KGSL_CONTEXT_SYNC)) {
+		if (copy_from_user(cmdbatch->ibdesc, (void __user *) cmdlist,
+			sizeof(struct kgsl_ibdesc) * numcmds)) {
+			ret = -EFAULT;
+			goto done;
+		}
+	}
 
-	trace_kgsl_waittimestamp_entry(device, context_id,
-				       kgsl_readtimestamp(device, context,
-							KGSL_TIMESTAMP_RETIRED),
-				       timestamp, timeout);
+	if (synclist && numsyncs) {
+		struct kgsl_cmd_syncpoint sync;
+		void __user *uptr = (void __user *) synclist;
+		int i;
 
-	result = device->ftbl->waittimestamp(dev_priv->device,
-					context, timestamp, timeout);
+		for (i = 0; i < numsyncs; i++) {
+			memset(&sync, 0, sizeof(sync));
 
-	trace_kgsl_waittimestamp_exit(device,
-				      kgsl_readtimestamp(device, context,
-							KGSL_TIMESTAMP_RETIRED),
-				      result);
+			if (copy_from_user(&sync, uptr, sizeof(sync))) {
+				ret = -EFAULT;
+				break;
+			}
 
-	/* Fire off any pending suspend operations that are in flight */
-	kgsl_active_count_put(dev_priv->device);
+			ret = kgsl_cmdbatch_add_sync(device, cmdbatch, &sync);
 
-	return result;
-}
+			if (ret)
+				break;
 
-static long kgsl_ioctl_device_waittimestamp(struct kgsl_device_private
-						*dev_priv, unsigned int cmd,
-						void *data)
-{
-	struct kgsl_device_waittimestamp *param = data;
+			uptr += sizeof(sync);
+		}
+	}
 
-	return _device_waittimestamp(dev_priv, NULL,
-			param->timestamp, param->timeout);
+done:
+	if (ret) {
+		kgsl_cmdbatch_destroy(cmdbatch);
+		return ERR_PTR(ret);
+	}
+
+	cmdbatch->flags = flags;
+
+	return cmdbatch;
 }
 
-static long kgsl_ioctl_device_waittimestamp_ctxtid(struct kgsl_device_private
-						*dev_priv, unsigned int cmd,
-						void *data)
+static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv,
+				      unsigned int cmd, void *data)
 {
-	struct kgsl_device_waittimestamp_ctxtid *param = data;
+	struct kgsl_ringbuffer_issueibcmds *param = data;
+	struct kgsl_device *device = dev_priv->device;
 	struct kgsl_context *context;
+	struct kgsl_cmdbatch *cmdbatch;
 	long result = -EINVAL;
 
-	context = kgsl_context_get_owner(dev_priv, param->context_id);
+	/* The legacy functions don't support synchronization commands */
+	if (param->flags & KGSL_CONTEXT_SYNC)
+		return -EINVAL;
 
-	if (context)
-		result = _device_waittimestamp(dev_priv, context,
-			param->timestamp, param->timeout);
+	/* Get the context */
+	context = kgsl_context_get_owner(dev_priv, param->drawctxt_id);
+	if (context == NULL)
+		goto done;
+
+	if (param->flags & KGSL_CONTEXT_SUBMIT_IB_LIST) {
+		/*
+		 * Do a quick sanity check on the number of IBs in the
+		 * submission
+		 */
+
+		if (param->numibs == 0 || param->numibs > 100000)
+			goto done;
+
+		cmdbatch = _kgsl_cmdbatch_create(device, context, param->flags,
+				param->ibdesc_addr, param->numibs, 0, 0);
+	} else {
+		cmdbatch = _kgsl_cmdbatch_create_legacy(device, context, param);
+	}
+
+	if (IS_ERR(cmdbatch)) {
+		result = PTR_ERR(cmdbatch);
+		goto done;
+	}
+
+	/* Run basic sanity checking on the command */
+	if (!_kgsl_cmdbatch_verify(device, cmdbatch)) {
+		KGSL_DRV_ERR(device, "Unable to verify the IBs\n");
+		goto free_cmdbatch;
+	}
 
+	result = dev_priv->device->ftbl->issueibcmds(dev_priv, context,
+		cmdbatch, &param->timestamp);
+
+free_cmdbatch:
+	if (result)
+		kgsl_cmdbatch_destroy(cmdbatch);
+
+done:
 	kgsl_context_put(context);
 	return result;
 }
 
-static long kgsl_ioctl_rb_issueibcmds(struct kgsl_device_private *dev_priv,
+static long kgsl_ioctl_submit_commands(struct kgsl_device_private *dev_priv,
 				      unsigned int cmd, void *data)
 {
-	int result = 0;
-	struct kgsl_ringbuffer_issueibcmds *param = data;
-	struct kgsl_ibdesc *ibdesc;
+	struct kgsl_submit_commands *param = data;
+	struct kgsl_device *device = dev_priv->device;
 	struct kgsl_context *context;
+	struct kgsl_cmdbatch *cmdbatch;
 
-	context = kgsl_context_get_owner(dev_priv, param->drawctxt_id);
-	if (context == NULL) {
-		result = -EINVAL;
-		goto done;
-	}
+	long result = -EINVAL;
 
-	if (param->flags & KGSL_CONTEXT_SUBMIT_IB_LIST) {
-		KGSL_DRV_INFO(dev_priv->device,
-			"Using IB list mode for ib submission, numibs: %d\n",
-			param->numibs);
-		if (!param->numibs) {
-			KGSL_DRV_ERR(dev_priv->device,
-				"Invalid numibs as parameter: %d\n",
-				 param->numibs);
-			result = -EINVAL;
-			goto done;
-		}
+	/* The number of IBs is ignored for sync commands */
+	if (!(param->flags & KGSL_CONTEXT_SYNC)) {
+		if (param->numcmds == 0 || param->numcmds > 100000)
+			return -EINVAL;
+	} else if (param->numcmds != 0) {
+		KGSL_DRV_ERR(device,
+			"Commands specified with the SYNC flag will be ignored\n");
+	}
 
-		/*
-		 * Put a reasonable upper limit on the number of IBs that can be
-		 * submitted
-		 */
+	context = kgsl_context_get_owner(dev_priv, param->context_id);
+	if (context == NULL)
+		return -EINVAL;
 
-		if (param->numibs > 10000) {
-			KGSL_DRV_ERR(dev_priv->device,
-				"Too many IBs submitted. count: %d max 10000\n",
-				param->numibs);
-			result = -EINVAL;
-			goto done;
-		}
+	cmdbatch = _kgsl_cmdbatch_create(device, context, param->flags,
+		(unsigned int) param->cmdlist, param->numcmds,
+		(unsigned int) param->synclist, param->numsyncs);
 
-		ibdesc = kzalloc(sizeof(struct kgsl_ibdesc) * param->numibs,
-					GFP_KERNEL);
-		if (!ibdesc) {
-			KGSL_MEM_ERR(dev_priv->device,
-				"kzalloc(%d) failed\n",
-				sizeof(struct kgsl_ibdesc) * param->numibs);
-			result = -ENOMEM;
-			goto done;
-		}
+	if (IS_ERR(cmdbatch)) {
+		result = PTR_ERR(cmdbatch);
+		goto done;
+	}
 
-		if (copy_from_user(ibdesc, (void *)param->ibdesc_addr,
-				sizeof(struct kgsl_ibdesc) * param->numibs)) {
-			result = -EFAULT;
-			KGSL_DRV_ERR(dev_priv->device,
-				"copy_from_user failed\n");
-			goto free_ibdesc;
-		}
-	} else {
-		KGSL_DRV_INFO(dev_priv->device,
-			"Using single IB submission mode for ib submission\n");
-		/* If user space driver is still using the old mode of
-		 * submitting single ib then we need to support that as well */
-		ibdesc = kzalloc(sizeof(struct kgsl_ibdesc), GFP_KERNEL);
-		if (!ibdesc) {
-			KGSL_MEM_ERR(dev_priv->device,
-				"kzalloc(%d) failed\n",
-				sizeof(struct kgsl_ibdesc));
-			result = -ENOMEM;
-			goto done;
-		}
-		ibdesc[0].gpuaddr = param->ibdesc_addr;
-		ibdesc[0].sizedwords = param->numibs;
-		param->numibs = 1;
+	/* Run basic sanity checking on the command */
+	if (!_kgsl_cmdbatch_verify(device, cmdbatch)) {
+		KGSL_DRV_ERR(device, "Unable to verify the IBs\n");
+		goto free_cmdbatch;
 	}
 
-	result = dev_priv->device->ftbl->issueibcmds(dev_priv,
-					     context,
-					     ibdesc,
-					     param->numibs,
-					     &param->timestamp,
-					     param->flags);
+	result = dev_priv->device->ftbl->issueibcmds(dev_priv, context,
+		cmdbatch, &param->timestamp);
+
+free_cmdbatch:
+	if (result)
+		kgsl_cmdbatch_destroy(cmdbatch);
 
-free_ibdesc:
-	kfree(ibdesc);
 done:
 	kgsl_context_put(context);
 	return result;
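
From userspace the new path looks roughly like the sketch below. The field
names mirror kgsl_ioctl_submit_commands() above; the exact uapi layouts and
the ioctl request number are assumptions, so treat this as a shape, not a
header copy.

#include <sys/ioctl.h>

struct kgsl_ibdesc {
	unsigned int gpuaddr;
	void *hostptr;		/* layout assumed */
	unsigned int sizedwords;
	unsigned int ctrl;
};

struct kgsl_submit_commands {
	unsigned int context_id;
	unsigned int flags;
	struct kgsl_ibdesc *cmdlist;
	unsigned int numcmds;
	void *synclist;
	unsigned int numsyncs;
	unsigned int timestamp;	/* written back by the kernel */
};

int submit_one_ib(int fd, unsigned long request, unsigned int context_id,
		unsigned int gpuaddr, unsigned int dwords)
{
	struct kgsl_ibdesc ib = {
		.gpuaddr = gpuaddr,
		.sizedwords = dwords,	/* must be non-zero, see the verify pass */
	};
	struct kgsl_submit_commands cmds = {
		.context_id = context_id,
		.cmdlist = &ib,
		.numcmds = 1,		/* 1..100000 for non-sync submissions */
	};

	/* request stands in for IOCTL_KGSL_SUBMIT_COMMANDS */
	return ioctl(fd, request, &cmds);
}
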
@@ -1271,15 +1990,12 @@ static long _cmdstream_freememontimestamp(struct kgsl_device_private *dev_priv,
 	struct kgsl_device *device = dev_priv->device;
 	unsigned int context_id = context ? context->id : KGSL_MEMSTORE_GLOBAL;
 
-	spin_lock(&dev_priv->process_priv->mem_lock);
 	entry = kgsl_sharedmem_find(dev_priv->process_priv, gpuaddr);
-	spin_unlock(&dev_priv->process_priv->mem_lock);
 
 	if (!entry) {
 		KGSL_DRV_ERR(dev_priv->device,
 				"invalid gpuaddr %08x\n", gpuaddr);
-		result = -EINVAL;
-		goto done;
+		return -EINVAL;
 	}
 	trace_kgsl_mem_timestamp_queue(device, entry, context_id,
 				       kgsl_readtimestamp(device, context,
@@ -1287,7 +2003,7 @@ static long _cmdstream_freememontimestamp(struct kgsl_device_private *dev_priv,
 				       timestamp);
 	result = kgsl_add_event(dev_priv->device, context_id, timestamp,
 				kgsl_freemem_event_cb, entry, dev_priv);
-done:
+	kgsl_mem_entry_put(entry);
 	return result;
 }
 
@@ -1324,27 +2040,16 @@ static long kgsl_ioctl_drawctxt_create(struct kgsl_device_private *dev_priv,
 	int result = 0;
 	struct kgsl_drawctxt_create *param = data;
 	struct kgsl_context *context = NULL;
+	struct kgsl_device *device = dev_priv->device;
 
-	context = kgsl_create_context(dev_priv);
-
+	context = device->ftbl->drawctxt_create(dev_priv, &param->flags);
 	if (IS_ERR(context)) {
 		result = PTR_ERR(context);
 		goto done;
 	}
-
-	if (dev_priv->device->ftbl->drawctxt_create) {
-		result = dev_priv->device->ftbl->drawctxt_create(
-			dev_priv->device, dev_priv->process_priv->pagetable,
-			context, &param->flags);
-		if (result)
-			goto done;
-	}
 	trace_kgsl_context_create(dev_priv->device, context, param->flags);
 	param->drawctxt_id = context->id;
 done:
-	if (result && !IS_ERR(context))
-		kgsl_context_detach(context);
-
 	return result;
 }
 
@@ -1353,14 +2058,11 @@ static long kgsl_ioctl_drawctxt_destroy(struct kgsl_device_private *dev_priv,
 {
 	struct kgsl_drawctxt_destroy *param = data;
 	struct kgsl_context *context;
-	long result = -EINVAL;
+	long result;
 
 	context = kgsl_context_get_owner(dev_priv, param->drawctxt_id);
 
-	if (context) {
-		kgsl_context_detach(context);
-		result = 0;
-	}
+	result = kgsl_context_detach(context);
 
 	kgsl_context_put(context);
 	return result;
@@ -1369,31 +2071,27 @@ static long kgsl_ioctl_drawctxt_destroy(struct kgsl_device_private *dev_priv,
 static long kgsl_ioctl_sharedmem_free(struct kgsl_device_private *dev_priv,
 					unsigned int cmd, void *data)
 {
-	int result = 0;
 	struct kgsl_sharedmem_free *param = data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_mem_entry *entry = NULL;
 
-	spin_lock(&private->mem_lock);
 	entry = kgsl_sharedmem_find(private, param->gpuaddr);
-	spin_unlock(&private->mem_lock);
-
-	if (entry) {
-		trace_kgsl_mem_free(entry);
+	if (!entry) {
+		KGSL_MEM_INFO(dev_priv->device, "invalid gpuaddr %08x\n",
+				param->gpuaddr);
+		return -EINVAL;
+	}
 
-		kgsl_memfree_hist_set_event(
-			entry->priv->pid,
-			entry->memdesc.gpuaddr,
-			entry->memdesc.size,
-			entry->memdesc.flags);
+	trace_kgsl_mem_free(entry);
 
-		kgsl_mem_entry_detach_process(entry);
-	} else {
-		KGSL_CORE_ERR("invalid gpuaddr %08x\n", param->gpuaddr);
-		result = -EINVAL;
-	}
+	kgsl_memfree_hist_set_event(entry->priv->pid,
+				    entry->memdesc.gpuaddr,
+				    entry->memdesc.size,
+				    entry->memdesc.flags);
 
-	return result;
+	kgsl_mem_entry_detach_process(entry);
+	kgsl_mem_entry_put(entry);
+	return 0;
 }
 
 static long kgsl_ioctl_gpumem_free_id(struct kgsl_device_private *dev_priv,
@@ -1412,6 +2110,7 @@ static long kgsl_ioctl_gpumem_free_id(struct kgsl_device_private *dev_priv,
 	trace_kgsl_mem_free(entry);
 
 	kgsl_mem_entry_detach_process(entry);
+	kgsl_mem_entry_put(entry);
 	return 0;
 }
 
@@ -1809,6 +2508,9 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv,
 	if (!can_use_cpu_map())
 		entry->memdesc.flags &= ~KGSL_MEMFLAGS_USE_CPU_MAP;
 
+	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU)
+		entry->memdesc.priv |= KGSL_MEMDESC_GUARD_PAGE;
+
 	switch (memtype) {
 	case KGSL_USER_MEM_TYPE_PMEM:
 		if (param->fd == 0 || param->len == 0)
@@ -1873,10 +2575,7 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv,
 	else if (entry->memdesc.size >= SZ_64K)
 		kgsl_memdesc_set_align(&entry->memdesc, ilog2(SZ_64));
 
-	result = kgsl_mmu_map(private->pagetable,
-			      &entry->memdesc,
-			      GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-
+	result = kgsl_mmu_map(private->pagetable, &entry->memdesc);
 	if (result)
 		goto error_put_file_ptr;
 
@@ -1896,7 +2595,6 @@ static long kgsl_ioctl_map_user_mem(struct kgsl_device_private *dev_priv,
 
 	trace_kgsl_mem_map(entry, param->fd);
 
-	kgsl_check_idle(dev_priv->device);
 	return result;
 
 error_unmap:
@@ -1916,7 +2614,6 @@ error_put_file_ptr:
 	}
 error:
 	kfree(entry);
-	kgsl_check_idle(dev_priv->device);
 	return result;
 }
 
@@ -1945,8 +2642,10 @@ static int _kgsl_gpumem_sync_cache(struct kgsl_mem_entry *entry, int op)
 
 	mode = kgsl_memdesc_get_cachemode(&entry->memdesc);
 	if (mode != KGSL_CACHEMODE_UNCACHED
-		&& mode != KGSL_CACHEMODE_WRITECOMBINE)
+		&& mode != KGSL_CACHEMODE_WRITECOMBINE) {
+		trace_kgsl_mem_sync_cache(entry, op);
 		kgsl_cache_range_op(&entry->memdesc, cacheop);
+	}
 
 done:
 	return ret;
@@ -1961,6 +2660,7 @@ kgsl_ioctl_gpumem_sync_cache(struct kgsl_device_private *dev_priv,
 	struct kgsl_gpumem_sync_cache *param = data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_mem_entry *entry = NULL;
+	long ret;
 
 	if (param->id != 0) {
 		entry = kgsl_sharedmem_find_id(private, param->id);
@@ -1970,9 +2670,7 @@ kgsl_ioctl_gpumem_sync_cache(struct kgsl_device_private *dev_priv,
 			return -EINVAL;
 		}
 	} else if (param->gpuaddr != 0) {
-		spin_lock(&private->mem_lock);
 		entry = kgsl_sharedmem_find(private, param->gpuaddr);
-		spin_unlock(&private->mem_lock);
 		if (entry == NULL) {
 			KGSL_MEM_INFO(dev_priv->device,
 					"can't find gpuaddr %x\n",
@@ -1983,7 +2681,100 @@ kgsl_ioctl_gpumem_sync_cache(struct kgsl_device_private *dev_priv,
 		return -EINVAL;
 	}
 
-	return _kgsl_gpumem_sync_cache(entry, param->op);
+	ret = _kgsl_gpumem_sync_cache(entry, param->op);
+	kgsl_mem_entry_put(entry);
+	return ret;
+}
+
+static int mem_id_cmp(const void *_a, const void *_b)
+{
+	const unsigned int *a = _a, *b = _b;
+	/* compare the pointed-to ids, not the pointers themselves */
+	if (*a < *b)
+		return -1;
+	return (*a > *b);
+}
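
Sorting first means duplicates end up adjacent, so a single last_id compare in
the walk below weeds them out (and, since last_id starts at zero, drops id 0
as well). The same idiom in plain userspace C with qsort():

#include <stdio.h>
#include <stdlib.h>

static int id_cmp(const void *_a, const void *_b)
{
	const unsigned int *a = _a, *b = _b;

	if (*a < *b)
		return -1;
	return *a > *b;
}

int main(void)
{
	unsigned int ids[] = { 7, 3, 7, 0, 3, 12 };
	unsigned int last = 0;
	size_t i, n = sizeof(ids) / sizeof(ids[0]);

	qsort(ids, n, sizeof(ids[0]), id_cmp);

	for (i = 0; i < n; i++) {
		if (ids[i] == last)	/* skips 0 and duplicates */
			continue;
		last = ids[i];
		printf("sync id %u\n", ids[i]);	/* prints 3, 7, 12 */
	}
	return 0;
}
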
+
+static long
+kgsl_ioctl_gpumem_sync_cache_bulk(struct kgsl_device_private *dev_priv,
+	unsigned int cmd, void *data)
+{
+	int i;
+	struct kgsl_gpumem_sync_cache_bulk *param = data;
+	struct kgsl_process_private *private = dev_priv->process_priv;
+	unsigned int id, last_id = 0, *id_list = NULL, actual_count = 0;
+	struct kgsl_mem_entry **entries = NULL;
+	long ret = 0;
+	size_t op_size = 0;
+	bool full_flush = false;
+
+	if (param->id_list == NULL || param->count == 0
+			|| param->count > (UINT_MAX/sizeof(unsigned int)))
+		return -EINVAL;
+
+	id_list = kzalloc(param->count * sizeof(unsigned int), GFP_KERNEL);
+	if (id_list == NULL)
+		return -ENOMEM;
+
+	entries = kzalloc(param->count * sizeof(*entries), GFP_KERNEL);
+	if (entries == NULL) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	if (copy_from_user(id_list, param->id_list,
+				param->count * sizeof(unsigned int))) {
+		ret = -EFAULT;
+		goto end;
+	}
+	/* sort the ids so we can weed out duplicates */
+	sort(id_list, param->count, sizeof(unsigned int), mem_id_cmp, NULL);
+
+	for (i = 0; i < param->count; i++) {
+		unsigned int cachemode;
+		struct kgsl_mem_entry *entry = NULL;
+
+		id = id_list[i];
+		/* skip 0 ids or duplicates */
+		if (id == last_id)
+			continue;
+
+		entry = kgsl_sharedmem_find_id(private, id);
+		if (entry == NULL)
+			continue;
+
+		/* skip uncached memory */
+		cachemode = kgsl_memdesc_get_cachemode(&entry->memdesc);
+		if (cachemode != KGSL_CACHEMODE_WRITETHROUGH &&
+		    cachemode != KGSL_CACHEMODE_WRITEBACK) {
+			kgsl_mem_entry_put(entry);
+			continue;
+		}
+
+		op_size += entry->memdesc.size;
+		entries[actual_count++] = entry;
+
+		/* If we exceed the breakeven point, flush the entire cache */
+		if (op_size >= kgsl_driver.full_cache_threshold &&
+		    param->op == KGSL_GPUMEM_CACHE_FLUSH) {
+			full_flush = true;
+			break;
+		}
+		last_id = id;
+	}
+	if (full_flush) {
+		trace_kgsl_mem_sync_full_cache(actual_count, op_size,
+					       param->op);
+		__cpuc_flush_kern_all();
+	}
+
+	for (i = 0; i < actual_count; i++) {
+		if (!full_flush)
+			_kgsl_gpumem_sync_cache(entries[i], param->op);
+		kgsl_mem_entry_put(entries[i]);
+	}
+end:
+	kfree(entries);
+	kfree(id_list);
+	return ret;
 }
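
The breakeven heuristic in the loop above reduces to a few lines: accumulate
per-buffer sizes and switch to a full cache flush once the running total
crosses the threshold. A standalone model, with SZ_16M mirroring the default
kgsl_driver.full_cache_threshold:

#include <stdbool.h>
#include <stddef.h>

#define SZ_16M (16 * 1024 * 1024)

bool needs_full_flush(const size_t *sizes, int count, size_t threshold)
{
	size_t total = 0;
	int i;

	for (i = 0; i < count; i++) {
		total += sizes[i];
		if (total >= threshold)
			return true;	/* line-by-line would be slower */
	}
	return false;
}
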
 
 /* Legacy cache function, does a flush (clean  + invalidate) */
@@ -1995,10 +2786,9 @@ kgsl_ioctl_sharedmem_flush_cache(struct kgsl_device_private *dev_priv,
 	struct kgsl_sharedmem_free *param = data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_mem_entry *entry = NULL;
+	long ret;
 
-	spin_lock(&private->mem_lock);
 	entry = kgsl_sharedmem_find(private, param->gpuaddr);
-	spin_unlock(&private->mem_lock);
 	if (entry == NULL) {
 		KGSL_MEM_INFO(dev_priv->device,
 				"can't find gpuaddr %x\n",
@@ -2006,7 +2796,9 @@ kgsl_ioctl_sharedmem_flush_cache(struct kgsl_device_private *dev_priv,
 		return -EINVAL;
 	}
 
-	return _kgsl_gpumem_sync_cache(entry, KGSL_GPUMEM_CACHE_FLUSH);
+	ret = _kgsl_gpumem_sync_cache(entry, KGSL_GPUMEM_CACHE_FLUSH);
+	kgsl_mem_entry_put(entry);
+	return ret;
 }
 
 /*
@@ -2035,6 +2827,9 @@ _gpumem_alloc(struct kgsl_device_private *dev_priv,
 	if (entry == NULL)
 		return -ENOMEM;
 
+	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU)
+		entry->memdesc.priv |= KGSL_MEMDESC_GUARD_PAGE;
+
 	result = kgsl_allocate_user(&entry->memdesc, private->pagetable, size,
 				    flags);
 	if (result != 0)
@@ -2042,7 +2837,6 @@ _gpumem_alloc(struct kgsl_device_private *dev_priv,
 
 	entry->memtype = KGSL_MEM_ENTRY_KERNEL;
 
-	kgsl_check_idle(dev_priv->device);
 	*ret_entry = entry;
 	return result;
 err:
@@ -2065,8 +2859,7 @@ kgsl_ioctl_gpumem_alloc(struct kgsl_device_private *dev_priv,
 	if (result)
 		return result;
 
-	result = kgsl_mmu_map(private->pagetable, &entry->memdesc,
-				kgsl_memdesc_protflags(&entry->memdesc));
+	result = kgsl_mmu_map(private->pagetable, &entry->memdesc);
 	if (result)
 		goto err;
 
@@ -2104,8 +2897,7 @@ kgsl_ioctl_gpumem_alloc_id(struct kgsl_device_private *dev_priv,
 		goto err;
 
 	if (!kgsl_memdesc_use_cpu_map(&entry->memdesc)) {
-		result = kgsl_mmu_map(private->pagetable, &entry->memdesc,
-				kgsl_memdesc_protflags(&entry->memdesc));
+		result = kgsl_mmu_map(private->pagetable, &entry->memdesc);
 		if (result)
 			goto err;
 	}
@@ -2147,9 +2939,7 @@ kgsl_ioctl_gpumem_get_info(struct kgsl_device_private *dev_priv,
 			return -EINVAL;
 		}
 	} else if (param->gpuaddr != 0) {
-		spin_lock(&private->mem_lock);
 		entry = kgsl_sharedmem_find(private, param->gpuaddr);
-		spin_unlock(&private->mem_lock);
 		if (entry == NULL) {
 			KGSL_MEM_INFO(dev_priv->device,
 					"can't find gpuaddr %lx\n",
@@ -2165,6 +2955,8 @@ kgsl_ioctl_gpumem_get_info(struct kgsl_device_private *dev_priv,
 	param->size = entry->memdesc.size;
 	param->mmapsize = kgsl_memdesc_mmapsize(&entry->memdesc);
 	param->useraddr = entry->memdesc.useraddr;
+
+	kgsl_mem_entry_put(entry);
 	return result;
 }
 
@@ -2176,14 +2968,14 @@ static long kgsl_ioctl_cff_syncmem(struct kgsl_device_private *dev_priv,
 	struct kgsl_process_private *private = dev_priv->process_priv;
 	struct kgsl_mem_entry *entry = NULL;
 
-	spin_lock(&private->mem_lock);
 	entry = kgsl_sharedmem_find_region(private, param->gpuaddr, param->len);
-	if (entry)
-		kgsl_cffdump_syncmem(dev_priv, &entry->memdesc, param->gpuaddr,
-				     param->len, true);
-	else
-		result = -EINVAL;
-	spin_unlock(&private->mem_lock);
+	if (!entry)
+		return -EINVAL;
+
+	kgsl_cffdump_syncmem(dev_priv->device, &entry->memdesc, param->gpuaddr,
+			     param->len, true);
+
+	kgsl_mem_entry_put(entry);
 	return result;
 }
 
@@ -2344,8 +3136,9 @@ static const struct {
 			kgsl_ioctl_device_waittimestamp_ctxtid,
 			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_RINGBUFFER_ISSUEIBCMDS,
-			kgsl_ioctl_rb_issueibcmds,
-			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
+			kgsl_ioctl_rb_issueibcmds, 0),
+	KGSL_IOCTL_FUNC(IOCTL_KGSL_SUBMIT_COMMANDS,
+			kgsl_ioctl_submit_commands, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_CMDSTREAM_READTIMESTAMP,
 			kgsl_ioctl_cmdstream_readtimestamp,
 			KGSL_IOCTL_LOCK),
@@ -2380,7 +3173,7 @@ static const struct {
 			kgsl_ioctl_cff_user_event, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_TIMESTAMP_EVENT,
 			kgsl_ioctl_timestamp_event,
-			KGSL_IOCTL_LOCK),
+			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_SETPROPERTY,
 			kgsl_ioctl_device_setproperty,
 			KGSL_IOCTL_LOCK | KGSL_IOCTL_WAKE),
@@ -2392,6 +3185,8 @@ static const struct {
 			kgsl_ioctl_gpumem_get_info, 0),
 	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_SYNC_CACHE,
 			kgsl_ioctl_gpumem_sync_cache, 0),
+	KGSL_IOCTL_FUNC(IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK,
+			kgsl_ioctl_gpumem_sync_cache_bulk, 0),
 };
 
 static long kgsl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
@@ -2472,14 +3267,19 @@ static long kgsl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 
 	if (lock) {
 		mutex_lock(&dev_priv->device->mutex);
-		if (use_hw)
-			kgsl_check_suspended(dev_priv->device);
+		if (use_hw) {
+			ret = kgsl_active_count_get(dev_priv->device);
+			if (ret < 0)
+				goto unlock;
+		}
 	}
 
 	ret = func(dev_priv, cmd, uptr);
 
+unlock:
 	if (lock) {
-		kgsl_check_idle_locked(dev_priv->device);
+		if (use_hw)
+			kgsl_active_count_put(dev_priv->device);
 		mutex_unlock(&dev_priv->device->mutex);
 	}
 
@@ -2567,21 +3367,17 @@ get_mmap_entry(struct kgsl_process_private *private,
 		struct kgsl_mem_entry **out_entry, unsigned long pgoff,
 		unsigned long len)
 {
-	int ret = -EINVAL;
+	int ret = 0;
 	struct kgsl_mem_entry *entry;
 
 	entry = kgsl_sharedmem_find_id(private, pgoff);
 	if (entry == NULL) {
-		spin_lock(&private->mem_lock);
 		entry = kgsl_sharedmem_find(private, pgoff << PAGE_SHIFT);
-		spin_unlock(&private->mem_lock);
 	}
 
 	if (!entry)
 		return -EINVAL;
 
-	kgsl_mem_entry_get(entry);
-
 	if (!entry->memdesc.ops ||
 		!entry->memdesc.ops->vmflags ||
 		!entry->memdesc.ops->vmfault) {
@@ -2606,12 +3402,18 @@ err_put:
 	return ret;
 }
 
+static inline bool
+mmap_range_valid(unsigned long addr, unsigned long len)
+{
+	return (addr + len) > addr && (addr + len) < TASK_SIZE;
+}
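
The first half of the check guards against unsigned wraparound: a range that
runs past the top of the address space ends up "below" its own start. A small
demonstration using 32-bit arithmetic, the interesting case for this driver:

#include <stdio.h>

int main(void)
{
	unsigned int addr = 0xfffff000u;
	unsigned int len  = 0x2000u;

	/* 0xfffff000 + 0x2000 wraps to 0x1000, which is <= addr */
	printf("end = 0x%x, wraps = %d\n", addr + len, addr + len <= addr);
	return 0;
}
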
+
 static unsigned long
 kgsl_get_unmapped_area(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long pgoff,
 			unsigned long flags)
 {
-	unsigned long ret = 0;
+	unsigned long ret = 0, orig_len = len;
 	unsigned long vma_offset = pgoff << PAGE_SHIFT;
 	struct kgsl_device_private *dev_priv = file->private_data;
 	struct kgsl_process_private *private = dev_priv->process_priv;
@@ -2656,10 +3458,26 @@ kgsl_get_unmapped_area(struct file *file, unsigned long addr,
 
 	if (align)
 		len += 1 << align;
+
+	if (!mmap_range_valid(addr, len))
+		addr = 0;
 	do {
 		ret = get_unmapped_area(NULL, addr, len, pgoff, flags);
-		if (IS_ERR_VALUE(ret))
+		if (IS_ERR_VALUE(ret)) {
+			/*
+			 * If we are really fragmented, there may not be room
+			 * for the alignment padding, so try again without it.
+			 */
+			if (!retry && (ret == (unsigned long)-ENOMEM)
+				&& (align > PAGE_SHIFT)) {
+				align = PAGE_SHIFT;
+				addr = 0;
+				len = orig_len;
+				retry = 1;
+				continue;
+			}
 			break;
+		}
 		if (align)
 			ret = ALIGN(ret, (1 << align));
 
@@ -2681,13 +3499,13 @@ kgsl_get_unmapped_area(struct file *file, unsigned long addr,
 		 * the whole address space at least once by wrapping
 		 * back around once.
 		 */
-		if (!retry && (addr + len >= TASK_SIZE)) {
+		if (!retry && !mmap_range_valid(addr, len)) {
 			addr = 0;
 			retry = 1;
 		} else {
 			ret = -EBUSY;
 		}
-	} while (addr + len < TASK_SIZE);
+	} while (mmap_range_valid(addr, len));
 
 	if (IS_ERR_VALUE(ret))
 		KGSL_MEM_INFO(device,
@@ -2712,6 +3530,10 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma_offset == device->memstore.gpuaddr)
 		return kgsl_mmap_memstore(device, vma);
 
+	/*
+	 * The reference count on the entry that we get from
+	 * get_mmap_entry() will be held until kgsl_gpumem_vm_close().
+	 */
 	ret = get_mmap_entry(private, &entry, vma->vm_pgoff,
 				vma->vm_end - vma->vm_start);
 	if (ret)
@@ -2720,8 +3542,7 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma)
 	if (kgsl_memdesc_use_cpu_map(&entry->memdesc)) {
 		entry->memdesc.gpuaddr = vma->vm_start;
 
-		ret = kgsl_mmu_map(private->pagetable, &entry->memdesc,
-				   kgsl_memdesc_protflags(&entry->memdesc));
+		ret = kgsl_mmu_map(private->pagetable, &entry->memdesc);
 		if (ret) {
 			kgsl_mem_entry_put(entry);
 			return ret;
@@ -2762,10 +3583,6 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma)
 		int sglen = entry->memdesc.sglen;
 		unsigned long addr = vma->vm_start;
 
-		/* don't map in the guard page, it should always fault */
-		if (kgsl_memdesc_has_guard_page(&entry->memdesc))
-			sglen--;
-
 		for_each_sg(entry->memdesc.sg, s, sglen, i) {
 			int j;
 			for (j = 0; j < (sg_dma_len(s) >> PAGE_SHIFT); j++) {
@@ -2782,7 +3599,6 @@ static int kgsl_mmap(struct file *file, struct vm_area_struct *vma)
 	entry->memdesc.useraddr = vma->vm_start;
 
 	trace_kgsl_mem_mmap(entry);
-
 	return 0;
 }
 
@@ -2809,6 +3625,11 @@ struct kgsl_driver kgsl_driver  = {
 	.devlock = __MUTEX_INITIALIZER(kgsl_driver.devlock),
 	.memfree_hist_mutex =
 		__MUTEX_INITIALIZER(kgsl_driver.memfree_hist_mutex),
+	/*
+	 * Full cache flushes are faster than line by line on at least
+	 * 8064 and 8974 once the region to be flushed is > 16mb.
+	 */
+	.full_cache_threshold = SZ_16M,
 };
 EXPORT_SYMBOL(kgsl_driver);
 
@@ -2949,11 +3770,12 @@ int kgsl_device_platform_probe(struct kgsl_device *device)
 		device->id, device->reg_phys, device->reg_len,
 		device->reg_virt);
 
+	rwlock_init(&device->context_lock);
+
 	result = kgsl_drm_init(pdev);
 	if (result)
 		goto error_pwrctrl_close;
 
-	kgsl_cffdump_open(device->id);
 
 	setup_timer(&device->idle_timer, kgsl_timer, (unsigned long) device);
 	status = kgsl_create_device_workqueue(device);
@@ -3011,11 +3833,7 @@ int kgsl_postmortem_dump(struct kgsl_device *device, int manual)
 	/* For a manual dump, make sure that the system is idle */
 
 	if (manual) {
-		if (device->active_cnt != 0) {
-			mutex_unlock(&device->mutex);
-			wait_for_completion(&device->suspend_gate);
-			mutex_lock(&device->mutex);
-		}
+		kgsl_active_count_wait(device);
 
 		if (device->state == KGSL_STATE_ACTIVE)
 			kgsl_idle(device);
@@ -3035,9 +3853,6 @@ int kgsl_postmortem_dump(struct kgsl_device *device, int manual)
 
 	/* Disable the idle timer so we don't get interrupted */
 	del_timer_sync(&device->idle_timer);
-	mutex_unlock(&device->mutex);
-	flush_workqueue(device->work_queue);
-	mutex_lock(&device->mutex);
 
 	/* Turn off napping to make sure we have the clocks full
 	   attention through the following process */
@@ -3077,7 +3892,6 @@ void kgsl_device_platform_remove(struct kgsl_device *device)
 {
 	kgsl_device_snapshot_close(device);
 
-	kgsl_cffdump_close(device->id);
 	kgsl_pwrctrl_uninit_sysfs(device);
 
 	pm_qos_remove_request(&device->pm_qos_req_dma);
diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h
index 30ac1a9602f6ae331f3265e59a5c978a4c67a17d..458400d412fdcdb036e1680dc10bea883b9ed47e 100644
--- a/drivers/gpu/msm/kgsl.h
+++ b/drivers/gpu/msm/kgsl.h
@@ -130,12 +130,14 @@ struct kgsl_driver {
 		unsigned int mapped_max;
 		unsigned int histogram[16];
 	} stats;
+	unsigned int full_cache_threshold;
 };
 
 extern struct kgsl_driver kgsl_driver;
 
 struct kgsl_pagetable;
 struct kgsl_memdesc;
+struct kgsl_cmdbatch;
 
 struct kgsl_memdesc_ops {
 	int (*vmflags)(struct kgsl_memdesc *);
@@ -149,6 +151,8 @@ struct kgsl_memdesc_ops {
 #define KGSL_MEMDESC_GUARD_PAGE BIT(0)
 /* Set if the memdesc is mapped into all pagetables */
 #define KGSL_MEMDESC_GLOBAL BIT(1)
+/* The memdesc is frozen during a snapshot */
+#define KGSL_MEMDESC_FROZEN BIT(2)
 
 /* shared memory allocation */
 struct kgsl_memdesc {
@@ -175,15 +179,10 @@ struct kgsl_memdesc {
 #define KGSL_MEM_ENTRY_ION    4
 #define KGSL_MEM_ENTRY_MAX    5
 
-/* List of flags */
-
-#define KGSL_MEM_ENTRY_FROZEN (1 << 0)
-
 struct kgsl_mem_entry {
 	struct kref refcount;
 	struct kgsl_memdesc memdesc;
 	int memtype;
-	int flags;
 	void *priv_data;
 	struct rb_node node;
 	unsigned int id;
@@ -229,6 +228,14 @@ int kgsl_resume_driver(struct platform_device *pdev);
 void kgsl_early_suspend_driver(struct early_suspend *h);
 void kgsl_late_resume_driver(struct early_suspend *h);
 
+void kgsl_trace_regwrite(struct kgsl_device *device, unsigned int offset,
+		unsigned int value);
+
+void kgsl_trace_issueibcmds(struct kgsl_device *device, int id,
+		struct kgsl_cmdbatch *cmdbatch,
+		unsigned int timestamp, unsigned int flags,
+		int result, unsigned int type);
+
 #ifdef CONFIG_MSM_KGSL_DRM
 extern int kgsl_drm_init(struct platform_device *dev);
 extern void kgsl_drm_exit(void);
@@ -246,6 +253,10 @@ static inline void kgsl_drm_exit(void)
 static inline int kgsl_gpuaddr_in_memdesc(const struct kgsl_memdesc *memdesc,
 				unsigned int gpuaddr, unsigned int size)
 {
+	/* set a minimum size to search for */
+	if (!size)
+		size = 1;
+
 	/* don't overflow */
 	if ((gpuaddr + size) < gpuaddr)
 		return 0;
diff --git a/drivers/gpu/msm/kgsl_cffdump.c b/drivers/gpu/msm/kgsl_cffdump.c
index e06c94d6b9fbf083e9084bccdb5f851e70812313..44f6e52fa2361e96c553e16c138883dbca231e4e 100644
--- a/drivers/gpu/msm/kgsl_cffdump.c
+++ b/drivers/gpu/msm/kgsl_cffdump.c
@@ -28,6 +28,7 @@
 #include "kgsl_log.h"
 #include "kgsl_sharedmem.h"
 #include "adreno_pm4types.h"
+#include "adreno.h"
 
 static struct rchan	*chan;
 static struct dentry	*dir;
@@ -334,7 +335,7 @@ void kgsl_cffdump_init()
 		return;
 	}
 
-	kgsl_cff_dump_enable = 1;
+	kgsl_cff_dump_enable = 0;
 
 	spin_lock_init(&cffdump_lock);
 
@@ -356,60 +357,71 @@ void kgsl_cffdump_destroy()
 		debugfs_remove(dir);
 }
 
-void kgsl_cffdump_open(enum kgsl_deviceid device_id)
+void kgsl_cffdump_open(struct kgsl_device *device)
 {
-	kgsl_cffdump_memory_base(device_id, KGSL_PAGETABLE_BASE,
-			kgsl_mmu_get_ptsize(), SZ_256K);
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	if (!kgsl_cff_dump_enable)
+		return;
+
+	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU) {
+		kgsl_cffdump_memory_base(device->id,
+			kgsl_mmu_get_base_addr(&device->mmu),
+			kgsl_mmu_get_ptsize(&device->mmu) +
+			KGSL_IOMMU_GLOBAL_MEM_SIZE, adreno_dev->gmem_size);
+	} else {
+		kgsl_cffdump_memory_base(device->id,
+			kgsl_mmu_get_base_addr(&device->mmu),
+			kgsl_mmu_get_ptsize(&device->mmu),
+			adreno_dev->gmem_size);
+	}
 }
 
 void kgsl_cffdump_memory_base(enum kgsl_deviceid device_id, unsigned int base,
 			      unsigned int range, unsigned gmemsize)
 {
+	if (!kgsl_cff_dump_enable)
+		return;
 	cffdump_printline(device_id, CFF_OP_MEMORY_BASE, base,
 			range, gmemsize, 0, 0);
 }
 
 void kgsl_cffdump_hang(enum kgsl_deviceid device_id)
 {
+	if (!kgsl_cff_dump_enable)
+		return;
 	cffdump_printline(device_id, CFF_OP_HANG, 0, 0, 0, 0, 0);
 }
 
-void kgsl_cffdump_close(enum kgsl_deviceid device_id)
+void kgsl_cffdump_close(struct kgsl_device *device)
 {
+	if (!kgsl_cff_dump_enable)
+		return;
-	cffdump_printline(device_id, CFF_OP_EOF, 0, 0, 0, 0, 0);
+	cffdump_printline(device->id, CFF_OP_EOF, 0, 0, 0, 0, 0);
 }
 
+
 void kgsl_cffdump_user_event(unsigned int cff_opcode, unsigned int op1,
 		unsigned int op2, unsigned int op3,
 		unsigned int op4, unsigned int op5)
 {
+	if (!kgsl_cff_dump_enable)
+		return;
 	cffdump_printline(-1, cff_opcode, op1, op2, op3, op4, op5);
 }
 
-void kgsl_cffdump_syncmem(struct kgsl_device_private *dev_priv,
-	const struct kgsl_memdesc *memdesc, uint gpuaddr, uint sizebytes,
-	bool clean_cache)
+void kgsl_cffdump_syncmem(struct kgsl_device *device,
+			  struct kgsl_memdesc *memdesc, uint gpuaddr,
+			  uint sizebytes, bool clean_cache)
 {
 	const void *src;
 
 	if (!kgsl_cff_dump_enable)
 		return;
 
+	BUG_ON(memdesc == NULL);
+
 	total_syncmem += sizebytes;
 
-	if (memdesc == NULL) {
-		struct kgsl_mem_entry *entry;
-		spin_lock(&dev_priv->process_priv->mem_lock);
-		entry = kgsl_sharedmem_find_region(dev_priv->process_priv,
-			gpuaddr, sizebytes);
-		spin_unlock(&dev_priv->process_priv->mem_lock);
-		if (entry == NULL) {
-			KGSL_CORE_ERR("did not find mapping "
-				"for gpuaddr: 0x%08x\n", gpuaddr);
-			return;
-		}
-		memdesc = &entry->memdesc;
-	}
 	src = (uint *)kgsl_gpuaddr_to_vaddr(memdesc, gpuaddr);
 	if (memdesc->hostptr == NULL) {
 		KGSL_CORE_ERR("no kernel mapping for "
@@ -522,7 +534,7 @@ static int subbuf_start_handler(struct rchan_buf *buf,
 }
 
 static struct dentry *create_buf_file_handler(const char *filename,
-	struct dentry *parent, int mode, struct rchan_buf *buf,
+	struct dentry *parent, unsigned short mode, struct rchan_buf *buf,
 	int *is_global)
 {
 	return debugfs_create_file(filename, mode, parent, buf,
diff --git a/drivers/gpu/msm/kgsl_cffdump.h b/drivers/gpu/msm/kgsl_cffdump.h
index 2733cc3fab8056bf9643d33625ef4364cd552e62..83695f81c66f34e73cdc0e8da387382e623611fd 100644
--- a/drivers/gpu/msm/kgsl_cffdump.h
+++ b/drivers/gpu/msm/kgsl_cffdump.h
@@ -22,10 +22,10 @@
 
 void kgsl_cffdump_init(void);
 void kgsl_cffdump_destroy(void);
-void kgsl_cffdump_open(enum kgsl_deviceid device_id);
-void kgsl_cffdump_close(enum kgsl_deviceid device_id);
-void kgsl_cffdump_syncmem(struct kgsl_device_private *dev_priv,
-	const struct kgsl_memdesc *memdesc, uint physaddr, uint sizebytes,
+void kgsl_cffdump_open(struct kgsl_device *device);
+void kgsl_cffdump_close(struct kgsl_device *device);
+void kgsl_cffdump_syncmem(struct kgsl_device *,
+	struct kgsl_memdesc *memdesc, uint physaddr, uint sizebytes,
 	bool clean_cache);
 void kgsl_cffdump_setmem(uint addr, uint value, uint sizebytes);
 void kgsl_cffdump_regwrite(enum kgsl_deviceid device_id, uint addr,
@@ -49,7 +49,7 @@ void kgsl_cffdump_hang(enum kgsl_deviceid device_id);
 
 #define kgsl_cffdump_init()					(void)0
 #define kgsl_cffdump_destroy()					(void)0
-#define kgsl_cffdump_open(device_id)				(void)0
+#define kgsl_cffdump_open(device)				(void)0
-#define kgsl_cffdump_close(device_id)				(void)0
+#define kgsl_cffdump_close(device)				(void)0
 #define kgsl_cffdump_syncmem(dev_priv, memdesc, addr, sizebytes, clean_cache) \
 	(void) 0
diff --git a/drivers/gpu/msm/kgsl_debugfs.c b/drivers/gpu/msm/kgsl_debugfs.c
index a2490ec80fdbf575c3dbd897f586fc6ec2b5b9c0..09c9dfe6256b8f0670980424ed03ef8eedccf51f 100644
--- a/drivers/gpu/msm/kgsl_debugfs.c
+++ b/drivers/gpu/msm/kgsl_debugfs.c
@@ -123,7 +123,6 @@ KGSL_DEBUGFS_LOG(cmd_log);
 KGSL_DEBUGFS_LOG(ctxt_log);
 KGSL_DEBUGFS_LOG(mem_log);
 KGSL_DEBUGFS_LOG(pwr_log);
-KGSL_DEBUGFS_LOG(ft_log);
 
 static int memfree_hist_print(struct seq_file *s, void *unused)
 {
@@ -185,7 +184,6 @@ void kgsl_device_debugfs_init(struct kgsl_device *device)
 	device->drv_log = KGSL_LOG_LEVEL_DEFAULT;
 	device->mem_log = KGSL_LOG_LEVEL_DEFAULT;
 	device->pwr_log = KGSL_LOG_LEVEL_DEFAULT;
-	device->ft_log = KGSL_LOG_LEVEL_DEFAULT;
 
 	debugfs_create_file("log_level_cmd", 0644, device->d_debugfs, device,
 			    &cmd_log_fops);
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index c48644b7d9e8c5d2f008a23f7801b0d801bc2732..fb3fade51e4292ebd8134c07812727a5351ebe96 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -13,9 +13,11 @@
 #ifndef __KGSL_DEVICE_H
 #define __KGSL_DEVICE_H
 
+#include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/pm_qos.h>
 #include <linux/earlysuspend.h>
+#include <linux/sched.h>
 
 #include "kgsl.h"
 #include "kgsl_mmu.h"
@@ -62,12 +64,21 @@
 #define KGSL_EVENT_TIMESTAMP_RETIRED 0
 #define KGSL_EVENT_CANCELLED 1
 
+/*
+ * "list" of event types for ftrace symbolic magic
+ */
+
+#define KGSL_EVENT_TYPES \
+	{ KGSL_EVENT_TIMESTAMP_RETIRED, "retired" }, \
+	{ KGSL_EVENT_CANCELLED, "cancelled" }
+
 struct kgsl_device;
 struct platform_device;
 struct kgsl_device_private;
 struct kgsl_context;
 struct kgsl_power_stats;
 struct kgsl_event;
+struct kgsl_cmdbatch;
 
 struct kgsl_functable {
 	/* Mandatory functions - these functions must be implemented
@@ -79,9 +90,10 @@ struct kgsl_functable {
 	void (*regwrite) (struct kgsl_device *device,
 		unsigned int offsetwords, unsigned int value);
 	int (*idle) (struct kgsl_device *device);
-	unsigned int (*isidle) (struct kgsl_device *device);
+	bool (*isidle) (struct kgsl_device *device);
 	int (*suspend_context) (struct kgsl_device *device);
-	int (*start) (struct kgsl_device *device, unsigned int init_ram);
+	int (*init) (struct kgsl_device *device);
+	int (*start) (struct kgsl_device *device);
 	int (*stop) (struct kgsl_device *device);
 	int (*getproperty) (struct kgsl_device *device,
 		enum kgsl_property_type type, void *value,
@@ -92,9 +104,8 @@ struct kgsl_functable {
 	unsigned int (*readtimestamp) (struct kgsl_device *device,
 		struct kgsl_context *context, enum kgsl_timestamp_type type);
 	int (*issueibcmds) (struct kgsl_device_private *dev_priv,
-		struct kgsl_context *context, struct kgsl_ibdesc *ibdesc,
-		unsigned int sizedwords, uint32_t *timestamp,
-		unsigned int flags);
+		struct kgsl_context *context, struct kgsl_cmdbatch *cmdbatch,
+		uint32_t *timestamps);
 	int (*setup_pt)(struct kgsl_device *device,
 		struct kgsl_pagetable *pagetable);
 	void (*cleanup_pt)(struct kgsl_device *device,
@@ -106,16 +117,16 @@ struct kgsl_functable {
 	void * (*snapshot)(struct kgsl_device *device, void *snapshot,
 		int *remain, int hang);
 	irqreturn_t (*irq_handler)(struct kgsl_device *device);
+	int (*drain)(struct kgsl_device *device);
 	/* Optional functions - these functions are not mandatory.  The
 	   driver will check that the function pointer is not NULL before
 	   calling the hook */
-	void (*setstate) (struct kgsl_device *device, unsigned int context_id,
+	int (*setstate) (struct kgsl_device *device, unsigned int context_id,
 			uint32_t flags);
-	int (*drawctxt_create) (struct kgsl_device *device,
-		struct kgsl_pagetable *pagetable, struct kgsl_context *context,
-		uint32_t *flags);
-	void (*drawctxt_destroy) (struct kgsl_device *device,
-		struct kgsl_context *context);
+	struct kgsl_context *(*drawctxt_create) (struct kgsl_device_private *,
+						uint32_t *flags);
+	int (*drawctxt_detach) (struct kgsl_context *context);
+	void (*drawctxt_destroy) (struct kgsl_context *context);
 	long (*ioctl) (struct kgsl_device_private *dev_priv,
 		unsigned int cmd, void *data);
 	int (*setproperty) (struct kgsl_device *device,
@@ -124,6 +135,8 @@ struct kgsl_functable {
 	int (*postmortem_dump) (struct kgsl_device *device, int manual);
 	int (*next_event)(struct kgsl_device *device,
 		struct kgsl_event *event);
+	void (*drawctxt_sched)(struct kgsl_device *device,
+		struct kgsl_context *context);
 };
 
 /* MH register values */
@@ -147,6 +160,46 @@ struct kgsl_event {
 	unsigned int created;
 };
 
+/**
+ * struct kgsl_cmdbatch - KGSL command descriptor
+ * @device: KGSL GPU device that the command was created for
+ * @context: KGSL context that created the command
+ * @lock: Spinlock protecting the list of sync points
+ * @timestamp: Timestamp assigned to the command
+ * @flags: Flags set by the user when the command batch was submitted
+ * @priv: Internal flags
+ * @fault_policy: Internal policy describing how to handle this command in case
+ * of a fault
+ * @ibcount: Number of IBs in the command list
+ * @ibdesc: Pointer to the list of IBs
+ * @expires: Point in time when the cmdbatch is considered to be hung
+ * @invalid: non-zero if the dispatcher determines the command and the owning
+ * context should be invalidated
+ * @refcount: kref structure to maintain the reference count
+ * @synclist: List of context/timestamp tuples to wait for before issuing
+ *
+ * This structure defines an atomic batch of command buffers issued from
+ * userspace.
+ */
+struct kgsl_cmdbatch {
+	struct kgsl_device *device;
+	struct kgsl_context *context;
+	spinlock_t lock;
+	uint32_t timestamp;
+	uint32_t flags;
+	uint32_t priv;
+	uint32_t fault_policy;
+	uint32_t ibcount;
+	struct kgsl_ibdesc *ibdesc;
+	unsigned long expires;
+	int invalid;
+	struct kref refcount;
+	struct list_head synclist;
+};
+
+/* Internal cmdbatch flags */
+
+#define CMDBATCH_FLAG_SKIP BIT(0)
+#define CMDBATCH_FLAG_FORCE_PREAMBLE BIT(1)
 
 struct kgsl_device {
 	struct device *dev;
@@ -174,16 +227,16 @@ struct kgsl_device {
 	uint32_t state;
 	uint32_t requested_state;
 
-	unsigned int active_cnt;
+	atomic_t active_cnt;
 	struct completion suspend_gate;
 
 	wait_queue_head_t wait_queue;
 	struct workqueue_struct *work_queue;
 	struct device *parentdev;
-	struct completion ft_gate;
 	struct dentry *d_debugfs;
 	struct idr context_idr;
 	struct early_suspend display_off;
+	rwlock_t context_lock;
 
 	void *snapshot;		/* Pointer to the snapshot memory region */
 	int snapshot_maxsize;   /* Max size of the snapshot region */
@@ -206,7 +259,6 @@ struct kgsl_device {
 	int drv_log;
 	int mem_log;
 	int pwr_log;
-	int ft_log;
 	int pm_dump_enable;
 	struct kgsl_pwrscale pwrscale;
 	struct kobject pwrscale_kobj;
@@ -214,6 +266,7 @@ struct kgsl_device {
 	struct work_struct ts_expired_ws;
 	struct list_head events;
 	struct list_head events_pending_list;
+	unsigned int events_last_timestamp;
 	s64 on_time;
 
 	/* Postmortem Control switches */
@@ -229,7 +282,6 @@ void kgsl_check_fences(struct work_struct *work);
 #define KGSL_DEVICE_COMMON_INIT(_dev) \
 	.hwaccess_gate = COMPLETION_INITIALIZER((_dev).hwaccess_gate),\
 	.suspend_gate = COMPLETION_INITIALIZER((_dev).suspend_gate),\
-	.ft_gate = COMPLETION_INITIALIZER((_dev).ft_gate),\
 	.idle_check_ws = __WORK_INITIALIZER((_dev).idle_check_ws,\
 			kgsl_idle_check),\
 	.ts_expired_ws  = __WORK_INITIALIZER((_dev).ts_expired_ws,\
@@ -244,37 +296,56 @@ void kgsl_check_fences(struct work_struct *work);
 	.ver_minor = DRIVER_VERSION_MINOR
 
 
+/* bits for struct kgsl_context.priv */
+/* the context has been destroyed by userspace and is no longer using the gpu */
+#define KGSL_CONTEXT_DETACHED 0
+/* the context has caused a pagefault */
+#define KGSL_CONTEXT_PAGEFAULT 1
+
 /**
  * struct kgsl_context - Master structure for a KGSL context object
- * @refcount - kref object for reference counting the context
- * @id - integer identifier for the context
- * @dev_priv - pointer to the owning device instance
- * @devctxt - pointer to the device specific context information
- * @reset_status - status indication whether a gpu reset occured and whether
+ * @refcount: kref object for reference counting the context
+ * @id: integer identifier for the context
+ * @priv: in-kernel context flags, use KGSL_CONTEXT_* values
+ * @device: pointer to the KGSL device that owns this context
+ * @pagetable: pointer to the pagetable used by this context
+ * @reset_status: status indication whether a gpu reset occured and whether
  * this context was responsible for causing it
- * @wait_on_invalid_ts - flag indicating if this context has tried to wait on a
+ * @wait_on_invalid_ts: flag indicating if this context has tried to wait on a
  * bad timestamp
- * @timeline - sync timeline used to create fences that can be signaled when a
+ * @timeline: sync timeline used to create fences that can be signaled when a
  * sync_pt timestamp expires
- * @events - list head of pending events for this context
- * @events_list - list node for the list of all contexts that have pending events
+ * @events: list head of pending events for this context
+ * @events_list: list node for the list of all contexts that have pending events
+ * @pid: process that owns this context.
+ * @pagefault_ts: global timestamp of the pagefault, if the
+ * KGSL_CONTEXT_PAGEFAULT bit is set in @priv.
  */
 struct kgsl_context {
 	struct kref refcount;
 	uint32_t id;
-	struct kgsl_device_private *dev_priv;
-	void *devctxt;
+	pid_t pid;
+	unsigned long priv;
+	struct kgsl_device *device;
+	struct kgsl_pagetable *pagetable;
 	unsigned int reset_status;
 	bool wait_on_invalid_ts;
 	struct sync_timeline *timeline;
 	struct list_head events;
 	struct list_head events_list;
+	unsigned int pagefault_ts;
 };
 
 struct kgsl_process_private {
 	unsigned int refcnt;
 	pid_t pid;
 	spinlock_t mem_lock;
+
+	/* General refcount for process private struct obj */
+	struct kref refcount;
+	/* Mutex to synchronize access to each process_private struct obj */
+	struct mutex process_private_mutex;
+
 	struct rb_root mem_rb;
 	struct idr mem_idr;
 	struct kgsl_pagetable *pagetable;
@@ -303,6 +374,9 @@ struct kgsl_device *kgsl_get_device(int dev_idx);
 int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
 	kgsl_event_func func, void *priv, void *owner);
 
+void kgsl_cancel_event(struct kgsl_device *device, struct kgsl_context *context,
+		unsigned int timestamp, kgsl_event_func func, void *priv);
+
 static inline void kgsl_process_add_stats(struct kgsl_process_private *priv,
 	unsigned int type, size_t size)
 {
@@ -390,8 +464,6 @@ static inline int kgsl_create_device_workqueue(struct kgsl_device *device)
 	return 0;
 }
 
-
-
 int kgsl_check_timestamp(struct kgsl_device *device,
 		struct kgsl_context *context, unsigned int timestamp);
 
@@ -416,10 +488,15 @@ kgsl_device_get_drvdata(struct kgsl_device *dev)
 
 void kgsl_context_destroy(struct kref *kref);
 
+int kgsl_context_init(struct kgsl_device_private *, struct kgsl_context
+		*context);
+
 /**
- * kgsl_context_put - Release context reference count
- * @context
+ * kgsl_context_put() - Release context reference count
+ * @context: Pointer to the KGSL context to be released
  *
+ * Reduce the reference count on a KGSL context and destroy it if it is no
+ * longer needed
  */
 static inline void
 kgsl_context_put(struct kgsl_context *context)
@@ -427,10 +504,26 @@ kgsl_context_put(struct kgsl_context *context)
 	if (context)
 		kref_put(&context->refcount, kgsl_context_destroy);
 }
+
 /**
- * kgsl_context_get - get a pointer to a KGSL context
- * @devicex - Pointer to the KGSL device that owns the context
- * @id - Context ID to return
+ * kgsl_context_detached() - check if a context is detached
+ * @context: the context
+ *
+ * Check if a context has been destroyed by userspace and is only waiting
+ * for reference counts to go away. This check is used to weed out
+ * contexts that shouldn't use the gpu, so NULL is considered detached.
+ */
+static inline bool kgsl_context_detached(struct kgsl_context *context)
+{
+	return (context == NULL || test_bit(KGSL_CONTEXT_DETACHED,
+						&context->priv));
+}
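
Detach can race with lookups, so the flag lives in an atomic bitfield: it is
set exactly once, and every lookup treats a detached (or NULL) context as
missing. A userspace model of the protocol using compiler atomics in place of
the kernel's test_bit() (the setting side is assumed to be an atomic
test-and-set; it is not shown in this patch):

#include <stdbool.h>
#include <stddef.h>

struct context {
	unsigned long priv;
};

#define CTX_DETACHED_BIT (1UL << 0)

bool context_detached(struct context *ctx)
{
	return ctx == NULL ||
		(__atomic_load_n(&ctx->priv, __ATOMIC_ACQUIRE) &
		 CTX_DETACHED_BIT);
}

/* Returns true only for the single caller that wins the detach race
 * and must therefore do the teardown work. */
bool context_detach(struct context *ctx)
{
	unsigned long old = __atomic_fetch_or(&ctx->priv, CTX_DETACHED_BIT,
					      __ATOMIC_ACQ_REL);

	return !(old & CTX_DETACHED_BIT);
}
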
+
+
+/**
+ * kgsl_context_get() - get a pointer to a KGSL context
+ * @device: Pointer to the KGSL device that owns the context
+ * @id: Context ID
  *
  * Find the context associated with the given ID number, increase the reference
  * count on it and return it.  The caller must make sure that this call is
@@ -438,26 +531,45 @@ kgsl_context_put(struct kgsl_context *context)
  * doesn't validate the ownership of the context with the calling process - use
  * kgsl_context_get_owner for that
  */
-
 static inline struct kgsl_context *kgsl_context_get(struct kgsl_device *device,
 		uint32_t id)
 {
 	struct kgsl_context *context = NULL;
 
-	rcu_read_lock();
+	read_lock(&device->context_lock);
+
 	context = idr_find(&device->context_idr, id);
 
-	if (context)
+	/* Don't return a context that has been detached */
+	if (kgsl_context_detached(context))
+		context = NULL;
+	else
 		kref_get(&context->refcount);
 
-	rcu_read_unlock();
+	read_unlock(&device->context_lock);
+
 	return context;
 }
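
The shape of the lookup: the reference is taken while the read lock is still
held, so a concurrent detach/destroy (which is assumed to take the write lock
before removing the entry) cannot free the context between the find and the
kref_get. A userspace analog with a fixed table standing in for the idr and a
pthread rwlock for context_lock:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct context {
	int refcount;
	bool detached;
};

#define MAX_CTX 16
static struct context *table[MAX_CTX];
static pthread_rwlock_t context_lock = PTHREAD_RWLOCK_INITIALIZER;

struct context *context_get(unsigned int id)
{
	struct context *ctx = NULL;

	pthread_rwlock_rdlock(&context_lock);
	if (id < MAX_CTX)
		ctx = table[id];	/* stands in for idr_find() */
	if (ctx != NULL && ctx->detached)
		ctx = NULL;		/* never hand out a detached context */
	else if (ctx != NULL)
		ctx->refcount++;	/* reference taken under the lock */
	pthread_rwlock_unlock(&context_lock);
	return ctx;
}
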
 
 /**
- * kgsl_context_get_owner - get a pointer to a KGSL context
- * @dev_priv - Pointer to the owner of the requesting process
- * @id - Context ID to return
+ * _kgsl_context_get() - lightweight function to just increment the ref count
+ * @context: Pointer to the KGSL context
+ *
+ * Get a reference to the specified KGSL context structure. This is a
+ * lightweight way to increase the refcount on a known context rather than
+ * going through kgsl_context_get() and searching the idr
+ */
+static inline void _kgsl_context_get(struct kgsl_context *context)
+{
+	if (context)
+		kref_get(&context->refcount);
+}
+
+/**
+ * kgsl_context_get_owner() - get a pointer to a KGSL context in a specific
+ * process
+ * @dev_priv: Pointer to the process struct
+ * @id: Context ID to return
  *
  * Find the context associated with the given ID number, increase the reference
  * count on it and return it.  The caller must make sure that this call is
@@ -472,8 +584,8 @@ static inline struct kgsl_context *kgsl_context_get_owner(
 
 	context = kgsl_context_get(dev_priv->device, id);
 
-	/* Verify that the context belongs to the dev_priv instance */
-	if (context && context->dev_priv != dev_priv) {
+	/* Verify that the context belongs to current calling process. */
+	if (context != NULL && context->pid != dev_priv->process_priv->pid) {
 		kgsl_context_put(context);
 		return NULL;
 	}
@@ -482,24 +594,12 @@ static inline struct kgsl_context *kgsl_context_get_owner(
 }
 
 /**
- * kgsl_active_count_put - Decrease the device active count
- * @device: Pointer to a KGSL device
+ * kgsl_context_cancel_events() - Cancel all events for a context
+ * @device:  Pointer to the KGSL device structure for the GPU
+ * @context: Pointer to the KGSL context
  *
- * Decrease the active count for the KGSL device and trigger the suspend_gate
- * completion if it hits zero
+ * Signal all pending events on the context with KGSL_EVENT_CANCELLED
  */
-static inline void
-kgsl_active_count_put(struct kgsl_device *device)
-{
-	if (device->active_cnt == 1)
-		INIT_COMPLETION(device->suspend_gate);
-
-	device->active_cnt--;
-
-	if (device->active_cnt == 0)
-		complete(&device->suspend_gate);
-}
-
 static inline void kgsl_context_cancel_events(struct kgsl_device *device,
 	struct kgsl_context *context)
 {
@@ -507,9 +607,9 @@ static inline void kgsl_context_cancel_events(struct kgsl_device *device,
 }
 
 /**
- * kgsl_context_cancel_events_timestamp - cancel events for a given timestamp
+ * kgsl_cancel_events_timestamp() - cancel events for a given timestamp
  * @device: Pointer to the KGSL device that owns the context
- * @cotnext: Pointer to the context that owns the event or NULL for global
+ * @context: Pointer to the context that owns the event or NULL for global
  * @timestamp: Timestamp to cancel events for
  *
  * Cancel events pending for a specific timestamp
@@ -519,4 +619,30 @@ static inline void kgsl_cancel_events_timestamp(struct kgsl_device *device,
 {
 	kgsl_signal_event(device, context, timestamp, KGSL_EVENT_CANCELLED);
 }
+
+void kgsl_cmdbatch_destroy(struct kgsl_cmdbatch *cmdbatch);
+
+void kgsl_cmdbatch_destroy_object(struct kref *kref);
+
+/**
+ * kgsl_cmdbatch_put() - Decrement the refcount for a command batch object
+ * @cmdbatch: Pointer to the command batch object
+ */
+static inline void kgsl_cmdbatch_put(struct kgsl_cmdbatch *cmdbatch)
+{
+	kref_put(&cmdbatch->refcount, kgsl_cmdbatch_destroy_object);
+}
+
+/**
+ * kgsl_cmdbatch_sync_pending() - return true if the cmdbatch is waiting
+ * @cmdbatch: Pointer to the command batch object to check
+ *
+ * Return non-zero if the specified command batch is still waiting for sync
+ * point dependencies to be satisfied
+ */
+static inline int kgsl_cmdbatch_sync_pending(struct kgsl_cmdbatch *cmdbatch)
+{
+	return !list_empty(&cmdbatch->synclist);
+}
+
 #endif  /* __KGSL_DEVICE_H */
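
To illustrate how the cmdbatch helpers declared above are expected to compose, here is a hedged sketch of a consumer; the submit function and its error handling are invented for illustration and are not the driver's actual dispatcher.

/* Hypothetical consumer of the cmdbatch refcount/sync helpers */
static int example_submit(struct kgsl_cmdbatch *cmdbatch)
{
	/* Defer submission while sync point dependencies are outstanding */
	if (kgsl_cmdbatch_sync_pending(cmdbatch))
		return -EAGAIN;

	/* ... issue the command batch to the hardware here ... */

	/* Drop this caller's reference; the object is freed at refcount 0 */
	kgsl_cmdbatch_put(cmdbatch);
	return 0;
}
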
diff --git a/drivers/gpu/msm/kgsl_events.c b/drivers/gpu/msm/kgsl_events.c
index dc49faa07e5e089b4388cc6d1597e91b871300a2..e8c6c5d8a8e64305bbb459bc50e1b46d1aa8c05a 100644
--- a/drivers/gpu/msm/kgsl_events.c
+++ b/drivers/gpu/msm/kgsl_events.c
@@ -90,12 +90,12 @@ static struct kgsl_event *_find_event(struct kgsl_device *device,
 }
 
 /**
- * _signal_event - send a signal to a specific event in the list
- * @device - KGSL device
- * @head - Pointer to the event list to process
- * @timestamp - timestamp of the event to signal
- * @cur - timestamp value to send to the callback
- * @type - Signal ID to send to the callback
+ * _signal_event() - send a signal to a specific event in the list
+ * @device: Pointer to the KGSL device struct
+ * @head: Pointer to the event list to process
+ * @timestamp: timestamp of the event to signal
+ * @cur: timestamp value to send to the callback
+ * @type: Signal ID to send to the callback
  *
  * Send the specified signal to the events in the list with the specified
  * timestamp. The timestamp 'cur' is sent to the callback so it knows
@@ -114,12 +114,12 @@ static void _signal_event(struct kgsl_device *device,
 }
 
 /**
- * _signal_events - send a signal to all the events in a list
- * @device - KGSL device
- * @head - Pointer to the event list to process
- * @timestamp - Timestamp to pass to the events (this should be the current
+ * _signal_events() - send a signal to all the events in a list
+ * @device: Pointer to the KGSL device struct
+ * @head: Pointer to the event list to process
+ * @timestamp: Timestamp to pass to the events (this should be the current
  * timestamp when the signal is sent)
- * @type - Signal ID to send to the callback
+ * @type: Signal ID to send to the callback
  *
  * Send the specified signal to all the events in the list and destroy them
  */
@@ -134,6 +134,16 @@ static void _signal_events(struct kgsl_device *device,
 
 }
 
+/**
+ * kgsl_signal_event() - send a signal to events at a specific timestamp
+ * @device: Pointer to the KGSL device struct
+ * @context: Pointer to the KGSL context
+ * @timestamp: Timestamp of the event to signal
+ * @type: Signal ID to send to the callback
+ *
+ * Send the specified signal to all the events in the context with the given
+ * timestamp
+ */
 void kgsl_signal_event(struct kgsl_device *device,
 		struct kgsl_context *context, unsigned int timestamp,
 		unsigned int type)
@@ -151,6 +161,14 @@ void kgsl_signal_event(struct kgsl_device *device,
 }
 EXPORT_SYMBOL(kgsl_signal_event);
 
+/**
+ * kgsl_signal_events() - send a signal to all events in the context
+ * @device: Pointer to the KGSL device struct
+ * @context: Pointer to the KGSL context
+ * @type: Signal ID to send to the callback function
+ *
+ * Send the specified signal to all the events in the context
+ */
 void kgsl_signal_events(struct kgsl_device *device,
 		struct kgsl_context *context, unsigned int type)
 {
@@ -192,6 +210,7 @@ EXPORT_SYMBOL(kgsl_signal_events);
 int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
 	kgsl_event_func func, void *priv, void *owner)
 {
+	int ret;
 	struct kgsl_event *event;
 	unsigned int cur_ts;
 	struct kgsl_context *context = NULL;
@@ -229,6 +248,17 @@ int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
 		return -ENOMEM;
 	}
 
+	/*
+	 * Increase the active count on the device to avoid going into power
+	 * saving modes while events are pending
+	 */
+	ret = kgsl_active_count_get_light(device);
+	if (ret < 0) {
+		kgsl_context_put(context);
+		kfree(event);
+		return ret;
+	}
+
 	event->context = context;
 	event->timestamp = ts;
 	event->priv = priv;
@@ -255,23 +285,17 @@ int kgsl_add_event(struct kgsl_device *device, u32 id, u32 ts,
 	} else
 		_add_event_to_list(&device->events, event);
 
-	/*
-	 * Increase the active count on the device to avoid going into power
-	 * saving modes while events are pending
-	 */
-
-	device->active_cnt++;
-
 	queue_work(device->work_queue, &device->ts_expired_ws);
 	return 0;
 }
 EXPORT_SYMBOL(kgsl_add_event);
 
 /**
- * kgsl_cancel_events - Cancel all generic events for a process
- * @device - KGSL device for the events to cancel
- * @owner - driver instance that owns the events to cancel
+ * kgsl_cancel_events() - Cancel all global events owned by a process
+ * @device: Pointer to the KGSL device struct
+ * @owner: driver instance that owns the events to cancel
  *
+ * Cancel all global events that match the owner pointer
  */
 void kgsl_cancel_events(struct kgsl_device *device, void *owner)
 {
@@ -291,6 +315,18 @@ void kgsl_cancel_events(struct kgsl_device *device, void *owner)
 }
 EXPORT_SYMBOL(kgsl_cancel_events);
 
+/**
+ * kgsl_cancel_event() - send a cancel signal to a specific event
+ * @device: Pointer to the KGSL device struct
+ * @context: Pointer to the KGSL context
+ * @timestamp: Timestamp of the event to cancel
+ * @func: Callback function of the event - this is used to match the actual
+ * event
+ * @priv: Private data for the callback function - this is used to match the
+ * actual event
+ *
+ * Send a cancel signal to the specific event that matches all the parameters
+ */
 void kgsl_cancel_event(struct kgsl_device *device, struct kgsl_context *context,
 		unsigned int timestamp, kgsl_event_func func,
 		void *priv)
@@ -363,10 +400,19 @@ void kgsl_process_events(struct work_struct *work)
 	struct kgsl_context *context, *tmp;
 	uint32_t timestamp;
 
-	mutex_lock(&device->mutex);
+	/*
+	 * Bail unless the global timestamp has advanced.  We can safely do this
+	 * outside of the mutex for speed
+	 */
 
-	/* Process expired global events */
 	timestamp = kgsl_readtimestamp(device, NULL, KGSL_TIMESTAMP_RETIRED);
+	if (timestamp == device->events_last_timestamp)
+		return;
+
+	mutex_lock(&device->mutex);
+
+	device->events_last_timestamp = timestamp;
+
 	_retire_events(device, &device->events, timestamp);
 	_mark_next_event(device, &device->events);
 
@@ -374,6 +420,11 @@ void kgsl_process_events(struct work_struct *work)
 	list_for_each_entry_safe(context, tmp, &device->events_pending_list,
 		events_list) {
 
+		/*
+		 * Hold a reference so that the list_del_init() below always
+		 * operates on a valid context
+		 */
+		_kgsl_context_get(context);
 		/*
 		 * If kgsl_timestamp_expired_context returns 0 then it no longer
 		 * has any pending events and can be removed from the list
@@ -381,6 +432,7 @@ void kgsl_process_events(struct work_struct *work)
 
 		if (kgsl_process_context_events(device, context) == 0)
 			list_del_init(&context->events_list);
+		kgsl_context_put(context);
 	}
 
 	mutex_unlock(&device->mutex);
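
The get/put pair added around the loop body above matters because the event callbacks may drop what would otherwise be the last reference to the context; without the pin, list_del_init() could touch freed memory. A condensed sketch of the general pattern, assuming only the refcount helpers shown earlier:

/* Pin each element before work that may release it, unpin afterwards */
list_for_each_entry_safe(context, tmp, &device->events_pending_list,
		events_list) {
	_kgsl_context_get(context);		/* keep the node alive */
	if (kgsl_process_context_events(device, context) == 0)
		list_del_init(&context->events_list);
	kgsl_context_put(context);		/* may free the context */
}
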
diff --git a/drivers/gpu/msm/kgsl_gpummu.c b/drivers/gpu/msm/kgsl_gpummu.c
index 8f285053fb8943811e905a14bcfe48425ae9b33c..e52bb5dfd374795e09274c46e7f8ee6ae7e43e04 100644
--- a/drivers/gpu/msm/kgsl_gpummu.c
+++ b/drivers/gpu/msm/kgsl_gpummu.c
@@ -465,12 +465,12 @@ err_free_gpummu:
 	return NULL;
 }
 
-static void kgsl_gpummu_default_setstate(struct kgsl_mmu *mmu,
+static int kgsl_gpummu_default_setstate(struct kgsl_mmu *mmu,
 					uint32_t flags)
 {
 	struct kgsl_gpummu_pt *gpummu_pt;
 	if (!kgsl_mmu_enabled())
-		return;
+		return 0;
 
 	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
 		kgsl_idle(mmu->device);
@@ -483,12 +483,16 @@ static void kgsl_gpummu_default_setstate(struct kgsl_mmu *mmu,
 		/* Invalidate all and tc */
 		kgsl_regwrite(mmu->device, MH_MMU_INVALIDATE,  0x00000003);
 	}
+
+	return 0;
 }
 
-static void kgsl_gpummu_setstate(struct kgsl_mmu *mmu,
+static int kgsl_gpummu_setstate(struct kgsl_mmu *mmu,
 				struct kgsl_pagetable *pagetable,
 				unsigned int context_id)
 {
+	int ret = 0;
+
 	if (mmu->flags & KGSL_FLAGS_STARTED) {
 		/* page table not current, then setup mmu to use new
 		 *  specified page table
@@ -501,10 +505,13 @@ static void kgsl_gpummu_setstate(struct kgsl_mmu *mmu,
 			kgsl_mmu_pt_get_flags(pagetable, mmu->device->id);
 
 			/* call device specific set page table */
-			kgsl_setstate(mmu, context_id, KGSL_MMUFLAGS_TLBFLUSH |
+			ret = kgsl_setstate(mmu, context_id,
+				KGSL_MMUFLAGS_TLBFLUSH |
 				KGSL_MMUFLAGS_PTUPDATE);
 		}
 	}
+
+	return ret;
 }
 
 static int kgsl_gpummu_init(struct kgsl_mmu *mmu)
@@ -541,6 +548,7 @@ static int kgsl_gpummu_start(struct kgsl_mmu *mmu)
 
 	struct kgsl_device *device = mmu->device;
 	struct kgsl_gpummu_pt *gpummu_pt;
+	int ret;
 
 	if (mmu->flags & KGSL_FLAGS_STARTED)
 		return 0;
@@ -552,9 +560,6 @@ static int kgsl_gpummu_start(struct kgsl_mmu *mmu)
 	/* setup MMU and sub-client behavior */
 	kgsl_regwrite(device, MH_MMU_CONFIG, mmu->config);
 
-	/* idle device */
-	kgsl_idle(device);
-
 	/* enable axi interrupts */
 	kgsl_regwrite(device, MH_INTERRUPT_MASK,
 			GSL_MMU_INT_MASK | MH_INTERRUPT_MASK__MMU_PAGE_FAULT);
@@ -585,10 +590,12 @@ static int kgsl_gpummu_start(struct kgsl_mmu *mmu)
 	kgsl_regwrite(mmu->device, MH_MMU_VA_RANGE,
 		      (KGSL_PAGETABLE_BASE |
 		      (CONFIG_MSM_KGSL_PAGE_TABLE_SIZE >> 16)));
-	kgsl_setstate(mmu, KGSL_MEMSTORE_GLOBAL, KGSL_MMUFLAGS_TLBFLUSH);
-	mmu->flags |= KGSL_FLAGS_STARTED;
 
-	return 0;
+	ret = kgsl_setstate(mmu, KGSL_MEMSTORE_GLOBAL, KGSL_MMUFLAGS_TLBFLUSH);
+	if (!ret)
+		mmu->flags |= KGSL_FLAGS_STARTED;
+
+	return ret;
 }
 
 static int
@@ -598,7 +605,7 @@ kgsl_gpummu_unmap(void *mmu_specific_pt,
 {
 	unsigned int numpages;
 	unsigned int pte, ptefirst, ptelast, superpte;
-	unsigned int range = kgsl_sg_size(memdesc->sg, memdesc->sglen);
+	unsigned int range = memdesc->size;
 	struct kgsl_gpummu_pt *gpummu_pt = mmu_specific_pt;
 
 	/* All GPU addresses as assigned are page aligned, but some
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index a12003ac126d0793e4c014ac9e3fd101d2e337d5..2c8abb1f6b3640d63a508728198f1e4c8ae91f8e 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -32,6 +32,7 @@
 #include "adreno.h"
 #include "kgsl_trace.h"
 #include "z180.h"
+#include "kgsl_cffdump.h"
 
 
 static struct kgsl_iommu_register_list kgsl_iommuv1_reg[KGSL_IOMMU_REG_MAX] = {
@@ -62,6 +63,13 @@ static struct kgsl_iommu_register_list kgsl_iommuv2_reg[KGSL_IOMMU_REG_MAX] = {
 
 struct remote_iommu_petersons_spinlock kgsl_iommu_sync_lock_vars;
 
+/*
+ * One page allocation for a guard region to protect against over-zealous
+ * GPU pre-fetch
+ */
+
+static struct page *kgsl_guard_page;
+
 static int get_iommu_unit(struct device *dev, struct kgsl_mmu **mmu_out,
 			struct kgsl_iommu_unit **iommu_unit_out)
 {
@@ -109,6 +117,170 @@ static struct kgsl_iommu_device *get_iommu_device(struct kgsl_iommu_unit *unit,
 	return NULL;
 }
 
+/*
+ * These functions help find the nearest allocated memory entries on either
+ * side of a faulting address. If we know the nearby allocations we can make
+ * a better guess about what should have been mapped in the faulting region.
+ */
+
+/*
+ * A local structure to make it easy to store the interesting bits for the
+ * memory entries on either side of the faulting address
+ */
+
+struct _mem_entry {
+	unsigned int gpuaddr;
+	unsigned int size;
+	unsigned int flags;
+	unsigned int priv;
+	pid_t pid;
+};
+
+/*
+ * Find the closest allocated memory block with a smaller GPU address than
+ * the given address
+ */
+
+static void _prev_entry(struct kgsl_process_private *priv,
+	unsigned int faultaddr, struct _mem_entry *ret)
+{
+	struct rb_node *node;
+	struct kgsl_mem_entry *entry;
+
+	for (node = rb_first(&priv->mem_rb); node; ) {
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+
+		if (entry->memdesc.gpuaddr > faultaddr)
+			break;
+
+		/*
+		 * If this is closer to the faulting address, then copy
+		 * the entry
+		 */
+
+		if (entry->memdesc.gpuaddr > ret->gpuaddr) {
+			ret->gpuaddr = entry->memdesc.gpuaddr;
+			ret->size = entry->memdesc.size;
+			ret->flags = entry->memdesc.flags;
+			ret->priv = entry->memdesc.priv;
+			ret->pid = priv->pid;
+		}
+
+		node = rb_next(&entry->node);
+	}
+}
+
+/*
+ * Find the closest allocated memory block with a greater starting GPU
+ * address than the given address
+ */
+
+static void _next_entry(struct kgsl_process_private *priv,
+	unsigned int faultaddr, struct _mem_entry *ret)
+{
+	struct rb_node *node;
+	struct kgsl_mem_entry *entry;
+
+	for (node = rb_last(&priv->mem_rb); node; ) {
+		entry = rb_entry(node, struct kgsl_mem_entry, node);
+
+		if (entry->memdesc.gpuaddr < faultaddr)
+			break;
+
+		/*
+		 * If this is closer to the faulting address, then copy
+		 * the entry
+		 */
+
+		if (entry->memdesc.gpuaddr < ret->gpuaddr) {
+			ret->gpuaddr = entry->memdesc.gpuaddr;
+			ret->size = entry->memdesc.size;
+			ret->flags = entry->memdesc.flags;
+			ret->priv = entry->memdesc.priv;
+			ret->pid = priv->pid;
+		}
+
+		node = rb_prev(&entry->node);
+	}
+}
+
+static void _find_mem_entries(struct kgsl_mmu *mmu, unsigned int faultaddr,
+	unsigned int ptbase, struct _mem_entry *preventry,
+	struct _mem_entry *nextentry)
+{
+	struct kgsl_process_private *private;
+	int id = kgsl_mmu_get_ptname_from_ptbase(mmu, ptbase);
+
+	memset(preventry, 0, sizeof(*preventry));
+	memset(nextentry, 0, sizeof(*nextentry));
+
+	/* Set the maximum possible gpuaddr as an initial value */
+	nextentry->gpuaddr = 0xFFFFFFFF;
+
+	mutex_lock(&kgsl_driver.process_mutex);
+
+	list_for_each_entry(private, &kgsl_driver.process_list, list) {
+
+		if (private->pagetable->name != id)
+			continue;
+
+		spin_lock(&private->mem_lock);
+		_prev_entry(private, faultaddr, preventry);
+		_next_entry(private, faultaddr, nextentry);
+		spin_unlock(&private->mem_lock);
+	}
+
+	mutex_unlock(&kgsl_driver.process_mutex);
+}
+
+static void _print_entry(struct kgsl_device *device, struct _mem_entry *entry)
+{
+	char name[32];
+	memset(name, 0, sizeof(name));
+
+	kgsl_get_memory_usage(name, sizeof(name) - 1, entry->flags);
+
+	KGSL_LOG_DUMP(device,
+		"[%8.8X - %8.8X] %s (pid = %d) (%s)\n",
+		entry->gpuaddr,
+		entry->gpuaddr + entry->size,
+		entry->priv & KGSL_MEMDESC_GUARD_PAGE ? "(+guard)" : "",
+		entry->pid, name);
+}
+
+static void _check_if_freed(struct kgsl_iommu_device *iommu_dev,
+	unsigned long addr, unsigned int pid)
+{
+	void *base = kgsl_driver.memfree_hist.base_hist_rb;
+	struct kgsl_memfree_hist_elem *wptr;
+	struct kgsl_memfree_hist_elem *p;
+
+	mutex_lock(&kgsl_driver.memfree_hist_mutex);
+	wptr = kgsl_driver.memfree_hist.wptr;
+	p = wptr;
+	for (;;) {
+		if (p->size && p->pid == pid)
+			if (addr >= p->gpuaddr &&
+				addr < (p->gpuaddr + p->size)) {
+
+				KGSL_LOG_DUMP(iommu_dev->kgsldev,
+					"---- premature free ----\n");
+				KGSL_LOG_DUMP(iommu_dev->kgsldev,
+					"[%8.8X-%8.8X] was already freed by pid %d\n",
+					p->gpuaddr,
+					p->gpuaddr + p->size,
+					p->pid);
+			}
+		p++;
+		if ((void *)p >= base + kgsl_driver.memfree_hist.size)
+			p = (struct kgsl_memfree_hist_elem *) base;
+
+		if (p == kgsl_driver.memfree_hist.wptr)
+			break;
+	}
+	mutex_unlock(&kgsl_driver.memfree_hist_mutex);
+}
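
The scan above starts at the write pointer and makes exactly one lap around the ring, wrapping when it runs off the end of the backing allocation. Reduced to a generic, self-contained sketch (the ring and element types here are hypothetical, not driver structures):

struct elem { int payload; };

struct ring {
	struct elem *base;	/* start of the backing storage */
	struct elem *wptr;	/* next slot to be overwritten */
	unsigned int nelem;	/* total slots in the ring */
};

static void scan_ring(struct ring *r, void (*visit)(struct elem *))
{
	struct elem *p = r->wptr;	/* oldest entry lives here */

	do {
		visit(p);
		if (++p == r->base + r->nelem)
			p = r->base;	/* wrap to the start */
	} while (p != r->wptr);		/* stop after one full lap */
}
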
+
 static int kgsl_iommu_fault_handler(struct iommu_domain *domain,
 	struct device *dev, unsigned long addr, int flags)
 {
@@ -124,6 +296,7 @@ static int kgsl_iommu_fault_handler(struct iommu_domain *domain,
 	unsigned int pid;
 	unsigned int fsynr0, fsynr1;
 	int write;
+	struct _mem_entry prev, next;
 
 	ret = get_iommu_unit(dev, &mmu, &iommu_unit);
 	if (ret)
@@ -168,6 +341,24 @@ static int kgsl_iommu_fault_handler(struct iommu_domain *domain,
 			write ? "write" : "read");
 	}
 
+	_check_if_freed(iommu_dev, addr, pid);
+
+	KGSL_LOG_DUMP(iommu_dev->kgsldev, "---- nearby memory ----\n");
+
+	_find_mem_entries(mmu, addr, ptbase, &prev, &next);
+
+	if (prev.gpuaddr)
+		_print_entry(iommu_dev->kgsldev, &prev);
+	else
+		KGSL_LOG_DUMP(iommu_dev->kgsldev, "*EMPTY*\n");
+
+	KGSL_LOG_DUMP(iommu_dev->kgsldev, " <- fault @ %8.8lX\n", addr);
+
+	if (next.gpuaddr != 0xFFFFFFFF)
+		_print_entry(iommu_dev->kgsldev, &next);
+	else
+		KGSL_LOG_DUMP(iommu_dev->kgsldev, "*EMPTY*\n");
+
 	mmu->fault = 1;
 	iommu_dev->fault = 1;
 
@@ -648,13 +839,10 @@ static int kgsl_iommu_init_sync_lock(struct kgsl_mmu *mmu)
 		return status;
 
 	/* Map Lock variables to GPU pagetable */
-	iommu->sync_lock_desc.priv |= KGSL_MEMDESC_GLOBAL;
-
 	pagetable = mmu->priv_bank_table ? mmu->priv_bank_table :
 				mmu->defaultpagetable;
 
-	status = kgsl_mmu_map(pagetable, &iommu->sync_lock_desc,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	status = kgsl_mmu_map_global(pagetable, &iommu->sync_lock_desc);
 
 	if (status) {
 		kgsl_mmu_unmap(pagetable, &iommu->sync_lock_desc);
@@ -914,10 +1102,12 @@ static int kgsl_iommu_get_pt_lsb(struct kgsl_mmu *mmu,
 	return 0;
 }
 
-static void kgsl_iommu_setstate(struct kgsl_mmu *mmu,
+static int kgsl_iommu_setstate(struct kgsl_mmu *mmu,
 				struct kgsl_pagetable *pagetable,
 				unsigned int context_id)
 {
+	int ret = 0;
+
 	if (mmu->flags & KGSL_FLAGS_STARTED) {
 		/* page table not current, then setup mmu to use new
 		 *  specified page table
@@ -928,10 +1118,12 @@ static void kgsl_iommu_setstate(struct kgsl_mmu *mmu,
 			flags |= kgsl_mmu_pt_get_flags(mmu->hwpagetable,
 							mmu->device->id) |
 							KGSL_MMUFLAGS_TLBFLUSH;
-			kgsl_setstate(mmu, context_id,
+			ret = kgsl_setstate(mmu, context_id,
 				KGSL_MMUFLAGS_PTUPDATE | flags);
 		}
 	}
+
+	return ret;
 }
 
 /*
@@ -959,23 +1151,18 @@ static int kgsl_iommu_setup_regs(struct kgsl_mmu *mmu,
 		return 0;
 
 	for (i = 0; i < iommu->unit_count; i++) {
-		iommu->iommu_units[i].reg_map.priv |= KGSL_MEMDESC_GLOBAL;
-		status = kgsl_mmu_map(pt,
-				&(iommu->iommu_units[i].reg_map),
-				GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-		if (status) {
-			iommu->iommu_units[i].reg_map.priv &=
-				~KGSL_MEMDESC_GLOBAL;
+		status = kgsl_mmu_map_global(pt,
+				&(iommu->iommu_units[i].reg_map));
+		if (status)
 			goto err;
-		}
 	}
+
 	return 0;
 err:
-	for (i--; i >= 0; i--) {
+	for (i--; i >= 0; i--)
 		kgsl_mmu_unmap(pt,
 				&(iommu->iommu_units[i].reg_map));
-		iommu->iommu_units[i].reg_map.priv &= ~KGSL_MEMDESC_GLOBAL;
-	}
+
 	return status;
 }
 
@@ -1049,6 +1236,15 @@ static int kgsl_iommu_init(struct kgsl_mmu *mmu)
 		iommu_ops.mmu_cleanup_pt = kgsl_iommu_cleanup_regs;
 	}
 
+	if (kgsl_guard_page == NULL) {
+		kgsl_guard_page = alloc_page(GFP_KERNEL | __GFP_ZERO |
+				__GFP_HIGHMEM);
+		if (kgsl_guard_page == NULL) {
+			status = -ENOMEM;
+			goto done;
+		}
+	}
+
 	dev_info(mmu->device->dev, "|%s| MMU type set for device is IOMMU\n",
 			__func__);
 done:
@@ -1241,8 +1437,6 @@ static int kgsl_iommu_start(struct kgsl_mmu *mmu)
 
 		kgsl_regwrite(mmu->device, MH_MMU_MPU_END,
 			mh->mpu_base + mh->mpu_range);
-	} else {
-		kgsl_regwrite(mmu->device, MH_MMU_CONFIG, 0x00000000);
 	}
 
 	mmu->hwpagetable = mmu->defaultpagetable;
@@ -1281,6 +1475,10 @@ static int kgsl_iommu_start(struct kgsl_mmu *mmu)
 	kgsl_iommu_lock_rb_in_tlb(mmu);
 	msm_iommu_unlock();
 
+	/* For complete CFF */
+	kgsl_cffdump_setmem(mmu->setstate_memory.gpuaddr +
+				KGSL_IOMMU_SETSTATE_NOP_OFFSET,
+				cp_nop_packet(1), sizeof(unsigned int));
 
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
 	mmu->flags |= KGSL_FLAGS_STARTED;
@@ -1299,7 +1497,7 @@ kgsl_iommu_unmap(void *mmu_specific_pt,
 		unsigned int *tlb_flags)
 {
 	int ret;
-	unsigned int range = kgsl_sg_size(memdesc->sg, memdesc->sglen);
+	unsigned int range = memdesc->size;
 	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
 
 	/* All GPU addresses as assigned are page aligned, but some
@@ -1311,6 +1509,9 @@ kgsl_iommu_unmap(void *mmu_specific_pt,
 	if (range == 0 || gpuaddr == 0)
 		return 0;
 
+	if (kgsl_memdesc_has_guard_page(memdesc))
+		range += PAGE_SIZE;
+
 	ret = iommu_unmap_range(iommu_pt->domain, gpuaddr, range);
 	if (ret)
 		KGSL_CORE_ERR("iommu_unmap_range(%p, %x, %d) failed "
@@ -1335,26 +1536,35 @@ kgsl_iommu_map(void *mmu_specific_pt,
 	int ret;
 	unsigned int iommu_virt_addr;
 	struct kgsl_iommu_pt *iommu_pt = mmu_specific_pt;
-	int size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
-	unsigned int iommu_flags = IOMMU_READ;
+	int size = memdesc->size;
 
 	BUG_ON(NULL == iommu_pt);
 
-	if (protflags & GSL_PT_PAGE_WV)
-		iommu_flags |= IOMMU_WRITE;
-
 	iommu_virt_addr = memdesc->gpuaddr;
 
 	ret = iommu_map_range(iommu_pt->domain, iommu_virt_addr, memdesc->sg,
-				size, iommu_flags);
+				size, protflags);
 	if (ret) {
-		KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %d) "
-				"failed with err: %d\n", iommu_pt->domain,
-				iommu_virt_addr, memdesc->sg, size,
-				iommu_flags, ret);
+		KGSL_CORE_ERR("iommu_map_range(%p, %x, %p, %d, %x) err: %d\n",
+			iommu_pt->domain, iommu_virt_addr, memdesc->sg, size,
+			protflags, ret);
 		return ret;
 	}
-
+	if (kgsl_memdesc_has_guard_page(memdesc)) {
+		ret = iommu_map(iommu_pt->domain, iommu_virt_addr + size,
+				page_to_phys(kgsl_guard_page), PAGE_SIZE,
+				protflags & ~IOMMU_WRITE);
+		if (ret) {
+			KGSL_CORE_ERR("iommu_map(%p, %x, %x, %x) err: %d\n",
+				iommu_pt->domain, iommu_virt_addr + size,
+				page_to_phys(kgsl_guard_page),
+				protflags & ~IOMMU_WRITE,
+				ret);
+			/* cleanup the partial mapping */
+			iommu_unmap_range(iommu_pt->domain, iommu_virt_addr,
+					  size);
+		}
+	}
 	return ret;
 }
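
Note the unwind shape of the error handling just above: the guard page is mapped with protflags & ~IOMMU_WRITE because the single shared page must never be writable, and a failure in that second step has to undo the first. The skeleton of the pattern, with placeholder functions standing in for the real driver calls:

/* Placeholders for the two mapping steps and the rollback */
extern int map_buffer(void);
extern int map_guard_page(void);
extern void unmap_buffer(void);

static int map_with_guard(void)
{
	int ret;

	ret = map_buffer();		/* step 1: writable buffer pages */
	if (ret)
		return ret;

	ret = map_guard_page();		/* step 2: read-only guard page */
	if (ret)
		unmap_buffer();		/* roll back step 1 on failure */

	return ret;
}
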
 
@@ -1423,6 +1633,11 @@ static int kgsl_iommu_close(struct kgsl_mmu *mmu)
 
 	kfree(iommu);
 
+	if (kgsl_guard_page != NULL) {
+		__free_page(kgsl_guard_page);
+		kgsl_guard_page = NULL;
+	}
+
 	return 0;
 }
 
@@ -1458,19 +1673,22 @@ kgsl_iommu_get_current_ptbase(struct kgsl_mmu *mmu)
  * cpu
- * Return - void
+ * Return - 0 on success else error code
  */
-static void kgsl_iommu_default_setstate(struct kgsl_mmu *mmu,
+static int kgsl_iommu_default_setstate(struct kgsl_mmu *mmu,
 					uint32_t flags)
 {
 	struct kgsl_iommu *iommu = mmu->priv;
 	int temp;
 	int i;
+	int ret = 0;
 	unsigned int pt_base = kgsl_iommu_get_pt_base_addr(mmu,
 						mmu->hwpagetable);
 	unsigned int pt_val;
 
-	if (kgsl_iommu_enable_clk(mmu, KGSL_IOMMU_CONTEXT_USER)) {
+	ret = kgsl_iommu_enable_clk(mmu, KGSL_IOMMU_CONTEXT_USER);
+
+	if (ret) {
 		KGSL_DRV_ERR(mmu->device, "Failed to enable iommu clocks\n");
-		return;
+		return ret;
 	}
 	/* Mask off the lsb of the pt base address since lsb will not change */
 	pt_base &= (iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
@@ -1513,6 +1731,7 @@ static void kgsl_iommu_default_setstate(struct kgsl_mmu *mmu,
 
 	/* Disable smmu clock */
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
+	return ret;
 }
 
 /*
@@ -1554,6 +1773,7 @@ struct kgsl_mmu_ops iommu_ops = {
 	.mmu_pagefault = NULL,
 	.mmu_get_current_ptbase = kgsl_iommu_get_current_ptbase,
 	.mmu_enable_clk = kgsl_iommu_enable_clk,
+	.mmu_disable_clk = kgsl_iommu_disable_clk,
 	.mmu_disable_clk_on_ts = kgsl_iommu_disable_clk_on_ts,
 	.mmu_get_pt_lsb = kgsl_iommu_get_pt_lsb,
 	.mmu_get_reg_gpuaddr = kgsl_iommu_get_reg_gpuaddr,
diff --git a/drivers/gpu/msm/kgsl_log.h b/drivers/gpu/msm/kgsl_log.h
index 83d14f79cc7f956b7e0ef70a851e06b517b5ceb1..81a35e0d20e5b784828f4eec58f7fae54b241a6d 100644
--- a/drivers/gpu/msm/kgsl_log.h
+++ b/drivers/gpu/msm/kgsl_log.h
@@ -103,15 +103,6 @@ KGSL_LOG_ERR(_dev->dev, _dev->pwr_log, fmt, ##args)
 #define KGSL_PWR_CRIT(_dev, fmt, args...) \
 KGSL_LOG_CRIT(_dev->dev, _dev->pwr_log, fmt, ##args)
 
-#define KGSL_FT_INFO(_dev, fmt, args...) \
-KGSL_LOG_INFO(_dev->dev, _dev->ft_log, fmt, ##args)
-#define KGSL_FT_WARN(_dev, fmt, args...) \
-KGSL_LOG_WARN(_dev->dev, _dev->ft_log, fmt, ##args)
-#define KGSL_FT_ERR(_dev, fmt, args...) \
-KGSL_LOG_ERR(_dev->dev, _dev->ft_log, fmt, ##args)
-#define KGSL_FT_CRIT(_dev, fmt, args...) \
-KGSL_LOG_CRIT(_dev->dev, _dev->ft_log, fmt, ##args)
-
 /* Core error messages - these are for core KGSL functions that have
    no device associated with them (such as memory) */
 
diff --git a/drivers/gpu/msm/kgsl_mmu.c b/drivers/gpu/msm/kgsl_mmu.c
index b2507a3aec39560e2103d50ea3b3761b7a1cd4e9..ea127d266e10b6e1d3a7824eb15c2b2605755ea2 100644
--- a/drivers/gpu/msm/kgsl_mmu.c
+++ b/drivers/gpu/msm/kgsl_mmu.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -560,7 +560,7 @@ void kgsl_mmu_putpagetable(struct kgsl_pagetable *pagetable)
 }
 EXPORT_SYMBOL(kgsl_mmu_putpagetable);
 
-void kgsl_setstate(struct kgsl_mmu *mmu, unsigned int context_id,
+int kgsl_setstate(struct kgsl_mmu *mmu, unsigned int context_id,
 			uint32_t flags)
 {
 	struct kgsl_device *device = mmu->device;
@@ -568,14 +568,16 @@ void kgsl_setstate(struct kgsl_mmu *mmu, unsigned int context_id,
 
 	if (!(flags & (KGSL_MMUFLAGS_TLBFLUSH | KGSL_MMUFLAGS_PTUPDATE))
 		&& !adreno_is_a2xx(adreno_dev))
-		return;
+		return 0;
 
 	if (KGSL_MMU_TYPE_NONE == kgsl_mmu_type)
-		return;
+		return 0;
 	else if (device->ftbl->setstate)
-		device->ftbl->setstate(device, context_id, flags);
+		return device->ftbl->setstate(device, context_id, flags);
 	else if (mmu->mmu_ops->mmu_device_setstate)
-		mmu->mmu_ops->mmu_device_setstate(mmu, flags);
+		return mmu->mmu_ops->mmu_device_setstate(mmu, flags);
+
+	return 0;
 }
 EXPORT_SYMBOL(kgsl_setstate);
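
Since kgsl_setstate() now reports failure, call sites are expected to check the result and avoid committing state that assumes the TLB flush or pagetable update actually happened, as kgsl_gpummu_start() above now does. A hedged sketch of such a call site (the surrounding logic is illustrative):

/* Hypothetical call site: only commit state once setstate succeeded */
ret = kgsl_setstate(mmu, context_id,
		KGSL_MMUFLAGS_TLBFLUSH | KGSL_MMUFLAGS_PTUPDATE);
if (ret)
	return ret;	/* leave hwpagetable and flags untouched */

mmu->flags |= KGSL_FLAGS_STARTED;
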
 
@@ -584,7 +586,6 @@ void kgsl_mh_start(struct kgsl_device *device)
 	struct kgsl_mh *mh = &device->mh;
 	/* force mmu off to for now*/
 	kgsl_regwrite(device, MH_MMU_CONFIG, 0);
-	kgsl_idle(device);
 
 	/* define physical memory range accessible by the core */
 	kgsl_regwrite(device, MH_MMU_MPU_BASE, mh->mpu_base);
@@ -605,16 +606,17 @@ void kgsl_mh_start(struct kgsl_device *device)
 	 * kgsl_pwrctrl_irq() is called
 	 */
 }
+EXPORT_SYMBOL(kgsl_mh_start);
 
 int
 kgsl_mmu_map(struct kgsl_pagetable *pagetable,
-				struct kgsl_memdesc *memdesc,
-				unsigned int protflags)
+				struct kgsl_memdesc *memdesc)
 {
 	int ret;
 	struct gen_pool *pool = NULL;
 	int size;
 	int page_align = ilog2(PAGE_SIZE);
+	unsigned int protflags = kgsl_memdesc_protflags(memdesc);
 
 	if (kgsl_mmu_type == KGSL_MMU_TYPE_NONE) {
 		if (memdesc->sglen == 1) {
@@ -634,7 +636,10 @@ kgsl_mmu_map(struct kgsl_pagetable *pagetable,
 		}
 	}
 
-	size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
+	/* Add space for the guard page when allocating the mmu VA. */
+	size = memdesc->size;
+	if (kgsl_memdesc_has_guard_page(memdesc))
+		size += PAGE_SIZE;
 
 	pool = pagetable->pool;
 
@@ -732,7 +737,10 @@ kgsl_mmu_unmap(struct kgsl_pagetable *pagetable,
 		return 0;
 	}
 
-	size = kgsl_sg_size(memdesc->sg, memdesc->sglen);
+	/* Add space for the guard page when freeing the mmu VA. */
+	size = memdesc->size;
+	if (kgsl_memdesc_has_guard_page(memdesc))
+		size += PAGE_SIZE;
 
 	start_addr = memdesc->gpuaddr;
 	end_addr = (memdesc->gpuaddr + size);
@@ -777,7 +785,7 @@ kgsl_mmu_unmap(struct kgsl_pagetable *pagetable,
 EXPORT_SYMBOL(kgsl_mmu_unmap);
 
 int kgsl_mmu_map_global(struct kgsl_pagetable *pagetable,
-			struct kgsl_memdesc *memdesc, unsigned int protflags)
+			struct kgsl_memdesc *memdesc)
 {
 	int result = -EINVAL;
 	unsigned int gpuaddr = 0;
@@ -789,11 +797,10 @@ int kgsl_mmu_map_global(struct kgsl_pagetable *pagetable,
 	/* Not all global mappings are needed for all MMU types */
 	if (!memdesc->size)
 		return 0;
-
 	gpuaddr = memdesc->gpuaddr;
 	memdesc->priv |= KGSL_MEMDESC_GLOBAL;
 
-	result = kgsl_mmu_map(pagetable, memdesc, protflags);
+	result = kgsl_mmu_map(pagetable, memdesc);
 	if (result)
 		goto error;
 
diff --git a/drivers/gpu/msm/kgsl_mmu.h b/drivers/gpu/msm/kgsl_mmu.h
index 9d1bffa945454b1d59e73c61778c42f8a8ab717a..fe1b2ee58eadd6a14aee76c0b72847bf8dedd35e 100644
--- a/drivers/gpu/msm/kgsl_mmu.h
+++ b/drivers/gpu/msm/kgsl_mmu.h
@@ -125,10 +125,10 @@ struct kgsl_mmu_ops {
 	int (*mmu_close) (struct kgsl_mmu *mmu);
 	int (*mmu_start) (struct kgsl_mmu *mmu);
 	void (*mmu_stop) (struct kgsl_mmu *mmu);
-	void (*mmu_setstate) (struct kgsl_mmu *mmu,
+	int (*mmu_setstate) (struct kgsl_mmu *mmu,
 		struct kgsl_pagetable *pagetable,
 		unsigned int context_id);
-	void (*mmu_device_setstate) (struct kgsl_mmu *mmu,
+	int (*mmu_device_setstate) (struct kgsl_mmu *mmu,
 					uint32_t flags);
 	void (*mmu_pagefault) (struct kgsl_mmu *mmu);
 	unsigned int (*mmu_get_current_ptbase)
@@ -137,6 +137,8 @@ struct kgsl_mmu_ops {
 		(struct kgsl_mmu *mmu, uint32_t ts, bool ts_valid);
 	int (*mmu_enable_clk)
 		(struct kgsl_mmu *mmu, int ctx_id);
+	void (*mmu_disable_clk)
+		(struct kgsl_mmu *mmu);
 	int (*mmu_get_pt_lsb)(struct kgsl_mmu *mmu,
 				unsigned int unit_id,
 				enum kgsl_iommu_context_id ctx_id);
@@ -204,14 +206,13 @@ int kgsl_mmu_init(struct kgsl_device *device);
 int kgsl_mmu_start(struct kgsl_device *device);
 int kgsl_mmu_close(struct kgsl_device *device);
 int kgsl_mmu_map(struct kgsl_pagetable *pagetable,
-		 struct kgsl_memdesc *memdesc,
-		 unsigned int protflags);
+		 struct kgsl_memdesc *memdesc);
 int kgsl_mmu_map_global(struct kgsl_pagetable *pagetable,
-			struct kgsl_memdesc *memdesc, unsigned int protflags);
+			struct kgsl_memdesc *memdesc);
 int kgsl_mmu_unmap(struct kgsl_pagetable *pagetable,
 		    struct kgsl_memdesc *memdesc);
 unsigned int kgsl_virtaddr_to_physaddr(void *virtaddr);
-void kgsl_setstate(struct kgsl_mmu *mmu, unsigned int context_id,
+int kgsl_setstate(struct kgsl_mmu *mmu, unsigned int context_id,
 			uint32_t flags);
 int kgsl_mmu_get_ptname_from_ptbase(struct kgsl_mmu *mmu,
 					unsigned int pt_base);
@@ -240,19 +241,23 @@ static inline unsigned int kgsl_mmu_get_current_ptbase(struct kgsl_mmu *mmu)
 		return 0;
 }
 
-static inline void kgsl_mmu_setstate(struct kgsl_mmu *mmu,
+static inline int kgsl_mmu_setstate(struct kgsl_mmu *mmu,
 			struct kgsl_pagetable *pagetable,
 			unsigned int context_id)
 {
 	if (mmu->mmu_ops && mmu->mmu_ops->mmu_setstate)
-		mmu->mmu_ops->mmu_setstate(mmu, pagetable, context_id);
+		return mmu->mmu_ops->mmu_setstate(mmu, pagetable, context_id);
+
+	return 0;
 }
 
-static inline void kgsl_mmu_device_setstate(struct kgsl_mmu *mmu,
+static inline int kgsl_mmu_device_setstate(struct kgsl_mmu *mmu,
 						uint32_t flags)
 {
 	if (mmu->mmu_ops && mmu->mmu_ops->mmu_device_setstate)
-		mmu->mmu_ops->mmu_device_setstate(mmu, flags);
+		return mmu->mmu_ops->mmu_device_setstate(mmu, flags);
+
+	return 0;
 }
 
 static inline void kgsl_mmu_stop(struct kgsl_mmu *mmu)
@@ -299,6 +304,12 @@ static inline int kgsl_mmu_enable_clk(struct kgsl_mmu *mmu,
 		return 0;
 }
 
+static inline void kgsl_mmu_disable_clk(struct kgsl_mmu *mmu)
+{
+	if (mmu->mmu_ops && mmu->mmu_ops->mmu_disable_clk)
+		mmu->mmu_ops->mmu_disable_clk(mmu);
+}
+
 static inline void kgsl_mmu_disable_clk_on_ts(struct kgsl_mmu *mmu,
 						unsigned int ts, bool ts_valid)
 {
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 452d8a3180913bb725418b63a83ae533c9672954..bcdce85b24b315b2fad01bb3c1755df0a9f93300 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -1012,6 +1012,14 @@ void kgsl_pwrctrl_close(struct kgsl_device *device)
 	pwr->power_flags = 0;
 }
 
+/**
+ * kgsl_idle_check() - Work function for GPU interrupts and idle timeouts.
+ * @work: Pointer to the idle check work struct embedded in the device
+ *
+ * This function is called for work that is queued by the interrupt
+ * handler or the idle timer. It attempts to transition to a clocks
+ * off state if the active_cnt is 0 and the hardware is idle.
+ */
 void kgsl_idle_check(struct work_struct *work)
 {
 	struct kgsl_device *device = container_of(work, struct kgsl_device,
@@ -1021,15 +1029,24 @@ void kgsl_idle_check(struct work_struct *work)
 		return;
 
 	mutex_lock(&device->mutex);
-	if (device->state & (KGSL_STATE_ACTIVE | KGSL_STATE_NAP)) {
-		kgsl_pwrscale_idle(device);
 
+	kgsl_pwrscale_idle(device);
+
+	if (device->state == KGSL_STATE_ACTIVE
+		|| device->state == KGSL_STATE_NAP) {
+
+		/* If we failed to sleep then reset the timer and try again */
 		if (kgsl_pwrctrl_sleep(device) != 0) {
+
+			kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
+
 			mod_timer(&device->idle_timer,
 					jiffies +
 					device->pwrctrl.interval_timeout);
-			/* If the GPU has been too busy to sleep, make sure *
-			 * that is acurately reflected in the % busy numbers. */
+			/*
+			 * If the GPU has been too busy to sleep, make sure
+			 * that is accurately reflected in the % busy numbers.
+			 */
 			device->pwrctrl.clk_stats.no_nap_cnt++;
 			if (device->pwrctrl.clk_stats.no_nap_cnt >
 							 UPDATE_BUSY) {
@@ -1037,13 +1054,11 @@ void kgsl_idle_check(struct work_struct *work)
 				device->pwrctrl.clk_stats.no_nap_cnt = 0;
 			}
 		}
-	} else if (device->state & (KGSL_STATE_HUNG |
-					KGSL_STATE_DUMP_AND_FT)) {
-		kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 	}
 
 	mutex_unlock(&device->mutex);
 }
+EXPORT_SYMBOL(kgsl_idle_check);
 
 void kgsl_timer(unsigned long data)
 {
@@ -1061,54 +1076,26 @@ void kgsl_timer(unsigned long data)
 	}
 }
 
+
+/**
+ * kgsl_pre_hwaccess - Enforce preconditions for touching registers
+ * @device: The device
+ *
+ * This function ensures that the correct lock is held and that the GPU
+ * clock is on immediately before a register is read or written. Note
+ * that this function does not check active_cnt because the registers
+ * must be accessed during device start and stop, when the active_cnt
+ * may legitimately be 0.
+ */
 void kgsl_pre_hwaccess(struct kgsl_device *device)
 {
+	/* In order to touch a register you must hold the device mutex... */
 	BUG_ON(!mutex_is_locked(&device->mutex));
-	switch (device->state) {
-	case KGSL_STATE_ACTIVE:
-		return;
-	case KGSL_STATE_NAP:
-	case KGSL_STATE_SLEEP:
-	case KGSL_STATE_SLUMBER:
-		kgsl_pwrctrl_wake(device);
-		break;
-	case KGSL_STATE_SUSPEND:
-		kgsl_check_suspended(device);
-		break;
-	case KGSL_STATE_INIT:
-	case KGSL_STATE_HUNG:
-	case KGSL_STATE_DUMP_AND_FT:
-		if (test_bit(KGSL_PWRFLAGS_CLK_ON,
-					 &device->pwrctrl.power_flags))
-			break;
-		else
-			KGSL_PWR_ERR(device,
-					"hw access while clocks off from state %d\n",
-					device->state);
-		break;
-	default:
-		KGSL_PWR_ERR(device, "hw access while in unknown state %d\n",
-					 device->state);
-		break;
-	}
+	/* and have the clock on! */
+	BUG_ON(!test_bit(KGSL_PWRFLAGS_CLK_ON, &device->pwrctrl.power_flags));
 }
 EXPORT_SYMBOL(kgsl_pre_hwaccess);
 
-void kgsl_check_suspended(struct kgsl_device *device)
-{
-	if (device->requested_state == KGSL_STATE_SUSPEND ||
-				device->state == KGSL_STATE_SUSPEND) {
-		mutex_unlock(&device->mutex);
-		wait_for_completion(&device->hwaccess_gate);
-		mutex_lock(&device->mutex);
-	} else if (device->state == KGSL_STATE_DUMP_AND_FT) {
-		mutex_unlock(&device->mutex);
-		wait_for_completion(&device->ft_gate);
-		mutex_lock(&device->mutex);
-	} else if (device->state == KGSL_STATE_SLUMBER)
-		kgsl_pwrctrl_wake(device);
-}
-
 static int
 _nap(struct kgsl_device *device)
 {
@@ -1187,6 +1174,8 @@ _slumber(struct kgsl_device *device)
 	case KGSL_STATE_NAP:
 	case KGSL_STATE_SLEEP:
 		del_timer_sync(&device->idle_timer);
+		/* make sure power is on to stop the device*/
+		kgsl_pwrctrl_enable(device);
 		device->ftbl->suspend_context(device);
 		device->ftbl->stop(device);
 		_sleep_accounting(device);
@@ -1236,9 +1225,9 @@ EXPORT_SYMBOL(kgsl_pwrctrl_sleep);
 
 /******************************************************************/
 /* Caller must hold the device mutex. */
-void kgsl_pwrctrl_wake(struct kgsl_device *device)
+int kgsl_pwrctrl_wake(struct kgsl_device *device)
 {
-	int status;
+	int status = 0;
 	unsigned int context_id;
 	unsigned int state = device->state;
 	unsigned int ts_processed = 0xdeaddead;
@@ -1247,7 +1236,7 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device)
 	kgsl_pwrctrl_request_state(device, KGSL_STATE_ACTIVE);
 	switch (device->state) {
 	case KGSL_STATE_SLUMBER:
-		status = device->ftbl->start(device, 0);
+		status = device->ftbl->start(device);
 		if (status) {
 			kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
 			KGSL_DRV_ERR(device, "start failed %d\n", status);
@@ -1276,9 +1265,6 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device)
 		/* Enable state before turning on irq */
 		kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
 		kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
-		/* Re-enable HW access */
-		mod_timer(&device->idle_timer,
-				jiffies + device->pwrctrl.interval_timeout);
 		pm_qos_update_request(&device->pm_qos_req_dma,
 					GPU_SWFI_LATENCY);
 	case KGSL_STATE_ACTIVE:
@@ -1288,8 +1274,10 @@ void kgsl_pwrctrl_wake(struct kgsl_device *device)
 		KGSL_PWR_WARN(device, "unhandled state %s\n",
 				kgsl_pwrstate_to_str(device->state));
 		kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
+		status = -EINVAL;
 		break;
 	}
+	return status;
 }
 EXPORT_SYMBOL(kgsl_pwrctrl_wake);
 
@@ -1342,10 +1330,6 @@ const char *kgsl_pwrstate_to_str(unsigned int state)
 		return "SLEEP";
 	case KGSL_STATE_SUSPEND:
 		return "SUSPEND";
-	case KGSL_STATE_HUNG:
-		return "HUNG";
-	case KGSL_STATE_DUMP_AND_FT:
-		return "DNR";
 	case KGSL_STATE_SLUMBER:
 		return "SLUMBER";
 	default:
@@ -1355,3 +1339,118 @@ const char *kgsl_pwrstate_to_str(unsigned int state)
 }
 EXPORT_SYMBOL(kgsl_pwrstate_to_str);
 
+
+/**
+ * kgsl_active_count_get() - Increase the device active count
+ * @device: Pointer to a KGSL device
+ *
+ * Increase the active count for the KGSL device and turn on
+ * clocks if this is the first reference. Code paths that need
+ * to touch the hardware or wait for the hardware to complete
+ * an operation must hold an active count reference until they
+ * are finished. An error code will be returned if waking the
+ * device fails. The device mutex must be held while calling
+ * this function.
+ */
+int kgsl_active_count_get(struct kgsl_device *device)
+{
+	int ret = 0;
+	BUG_ON(!mutex_is_locked(&device->mutex));
+
+	if (atomic_read(&device->active_cnt) == 0) {
+		if (device->requested_state == KGSL_STATE_SUSPEND ||
+				device->state == KGSL_STATE_SUSPEND) {
+			mutex_unlock(&device->mutex);
+			wait_for_completion(&device->hwaccess_gate);
+			mutex_lock(&device->mutex);
+		}
+
+		/* Stop the idle timer */
+		del_timer_sync(&device->idle_timer);
+
+		ret = kgsl_pwrctrl_wake(device);
+	}
+	if (ret == 0)
+		atomic_inc(&device->active_cnt);
+	trace_kgsl_active_count(device,
+		(unsigned long) __builtin_return_address(0));
+	return ret;
+}
+EXPORT_SYMBOL(kgsl_active_count_get);
+
+/**
+ * kgsl_active_count_get_light() - Increase the device active count
+ * @device: Pointer to a KGSL device
+ *
+ * Increase the active count for the KGSL device WITHOUT
+ * turning on the clocks based on the assumption that the clocks are already
+ * on from a previous active_count_get(). Currently this is only used for
+ * creating kgsl_events.
+ */
+int kgsl_active_count_get_light(struct kgsl_device *device)
+{
+	if (atomic_inc_not_zero(&device->active_cnt) == 0) {
+		dev_WARN_ONCE(device->dev, 1, "active count is 0!\n");
+		return -EINVAL;
+	}
+
+	trace_kgsl_active_count(device,
+		(unsigned long) __builtin_return_address(0));
+	return 0;
+}
+EXPORT_SYMBOL(kgsl_active_count_get_light);
+
+/**
+ * kgsl_active_count_put() - Decrease the device active count
+ * @device: Pointer to a KGSL device
+ *
+ * Decrease the active count for the KGSL device and turn off
+ * clocks if there are no remaining references. This function will
+ * transition the device to NAP if there are no other pending state
+ * changes. It also completes the suspend gate.  The device mutex must
+ * be held while calling this function.
+ */
+void kgsl_active_count_put(struct kgsl_device *device)
+{
+	BUG_ON(!mutex_is_locked(&device->mutex));
+	BUG_ON(atomic_read(&device->active_cnt) == 0);
+
+	kgsl_pwrscale_idle(device);
+
+	if (atomic_dec_and_test(&device->active_cnt)) {
+		INIT_COMPLETION(device->suspend_gate);
+
+		if (device->pwrctrl.nap_allowed == true) {
+			/* Request nap */
+			kgsl_pwrctrl_request_state(device, KGSL_STATE_NAP);
+			kgsl_pwrctrl_sleep(device);
+		}
+
+		mod_timer(&device->idle_timer,
+			jiffies + device->pwrctrl.interval_timeout);
+
+		complete(&device->suspend_gate);
+	}
+
+	trace_kgsl_active_count(device,
+		(unsigned long) __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kgsl_active_count_put);
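
Taken together, the canonical pairing for these helpers looks like the sketch below: take a reference under the device mutex before touching hardware, and drop it when the work is done. The ioctl wrapper itself is invented for illustration.

/* Hypothetical ioctl path holding an active count across register access */
static int example_hw_ioctl(struct kgsl_device *device)
{
	int ret;

	mutex_lock(&device->mutex);

	ret = kgsl_active_count_get(device);	/* wakes the GPU if needed */
	if (ret)
		goto unlock;

	/* ... read/write registers or submit work here ... */

	kgsl_active_count_put(device);		/* may allow a nap */
unlock:
	mutex_unlock(&device->mutex);
	return ret;
}
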
+
+/**
+ * kgsl_active_count_wait() - Wait for activity to finish.
+ * @device: Pointer to a KGSL device
+ *
+ * Block until all active_cnt users put() their reference.
+ */
+void kgsl_active_count_wait(struct kgsl_device *device)
+{
+	BUG_ON(!mutex_is_locked(&device->mutex));
+
+	if (atomic_read(&device->active_cnt) != 0) {
+		mutex_unlock(&device->mutex);
+		wait_for_completion(&device->suspend_gate);
+		mutex_lock(&device->mutex);
+	}
+}
+EXPORT_SYMBOL(kgsl_active_count_wait);
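
kgsl_active_count_wait() is intended for paths like suspend that must drain all active users before powering down; note that it releases and retakes the device mutex internally. A sketch under that assumption, with the suspend wrapper name invented:

/* Hypothetical suspend step: wait for users, then stop the device */
static int example_suspend(struct kgsl_device *device)
{
	mutex_lock(&device->mutex);
	kgsl_pwrctrl_request_state(device, KGSL_STATE_SUSPEND);
	kgsl_active_count_wait(device);	/* blocks until active_cnt hits 0 */
	/* ... now safe to stop clocks and save state ... */
	mutex_unlock(&device->mutex);
	return 0;
}
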
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.h b/drivers/gpu/msm/kgsl_pwrctrl.h
index 8d66505cd5dbf4d5705d9d0924cc02a737f8dcc0..94cd8eb5b52e2efa09bc247e97fa730e0523338c 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.h
+++ b/drivers/gpu/msm/kgsl_pwrctrl.h
@@ -93,9 +93,8 @@ void kgsl_pwrctrl_close(struct kgsl_device *device);
 void kgsl_timer(unsigned long data);
 void kgsl_idle_check(struct work_struct *work);
 void kgsl_pre_hwaccess(struct kgsl_device *device);
-void kgsl_check_suspended(struct kgsl_device *device);
 int kgsl_pwrctrl_sleep(struct kgsl_device *device);
-void kgsl_pwrctrl_wake(struct kgsl_device *device);
+int kgsl_pwrctrl_wake(struct kgsl_device *device);
 void kgsl_pwrctrl_pwrlevel_change(struct kgsl_device *device,
 	unsigned int level);
 int kgsl_pwrctrl_init_sysfs(struct kgsl_device *device);
@@ -109,4 +108,10 @@ static inline unsigned long kgsl_get_clkrate(struct clk *clk)
 
 void kgsl_pwrctrl_set_state(struct kgsl_device *device, unsigned int state);
 void kgsl_pwrctrl_request_state(struct kgsl_device *device, unsigned int state);
+
+int kgsl_active_count_get(struct kgsl_device *device);
+int kgsl_active_count_get_light(struct kgsl_device *device);
+void kgsl_active_count_put(struct kgsl_device *device);
+void kgsl_active_count_wait(struct kgsl_device *device);
+
 #endif /* __KGSL_PWRCTRL_H */
diff --git a/drivers/gpu/msm/kgsl_pwrscale.c b/drivers/gpu/msm/kgsl_pwrscale.c
index dffae7016e162c343db44100f14370dfd3ca9b8b..4f7dc5cc2717ae35ed22545efbeaaf0afb7f5636 100644
--- a/drivers/gpu/msm/kgsl_pwrscale.c
+++ b/drivers/gpu/msm/kgsl_pwrscale.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -241,6 +241,7 @@ void kgsl_pwrscale_busy(struct kgsl_device *device)
 			device->pwrscale.policy->busy(device,
 					&device->pwrscale);
 }
+EXPORT_SYMBOL(kgsl_pwrscale_busy);
 
 void kgsl_pwrscale_idle(struct kgsl_device *device)
 {
diff --git a/drivers/gpu/msm/kgsl_sharedmem.c b/drivers/gpu/msm/kgsl_sharedmem.c
index bbab3c2f0fb7d855571c626a594b862e10726a5d..9329846bac80325967279bbc40b57c13608c6e33 100755
--- a/drivers/gpu/msm/kgsl_sharedmem.c
+++ b/drivers/gpu/msm/kgsl_sharedmem.c
@@ -65,14 +65,6 @@ struct mem_entry_stats {
 		mem_entry_max_show), \
 }
 
-
-/*
- * One page allocation for a guard region to protect against over-zealous
- * GPU pre-fetch
- */
-
-static struct page *kgsl_guard_page;
-
 /**
  * Given a kobj, find the process structure attached to it
  */
@@ -244,6 +236,29 @@ static int kgsl_drv_histogram_show(struct device *dev,
 	return len;
 }
 
+static int kgsl_drv_full_cache_threshold_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int ret;
+	unsigned int thresh;
+	ret = sscanf(buf, "%u", &thresh);
+	if (ret != 1)
+		return -EINVAL;
+
+	kgsl_driver.full_cache_threshold = thresh;
+
+	return count;
+}
+
+static int kgsl_drv_full_cache_threshold_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n",
+			kgsl_driver.full_cache_threshold);
+}
+
 DEVICE_ATTR(vmalloc, 0444, kgsl_drv_memstat_show, NULL);
 DEVICE_ATTR(vmalloc_max, 0444, kgsl_drv_memstat_show, NULL);
 DEVICE_ATTR(page_alloc, 0444, kgsl_drv_memstat_show, NULL);
@@ -253,6 +268,9 @@ DEVICE_ATTR(coherent_max, 0444, kgsl_drv_memstat_show, NULL);
 DEVICE_ATTR(mapped, 0444, kgsl_drv_memstat_show, NULL);
 DEVICE_ATTR(mapped_max, 0444, kgsl_drv_memstat_show, NULL);
 DEVICE_ATTR(histogram, 0444, kgsl_drv_histogram_show, NULL);
+DEVICE_ATTR(full_cache_threshold, 0644,
+		kgsl_drv_full_cache_threshold_show,
+		kgsl_drv_full_cache_threshold_store);
 
 static const struct device_attribute *drv_attr_list[] = {
 	&dev_attr_vmalloc,
@@ -264,6 +282,7 @@ static const struct device_attribute *drv_attr_list[] = {
 	&dev_attr_mapped,
 	&dev_attr_mapped_max,
 	&dev_attr_histogram,
+	&dev_attr_full_cache_threshold,
 	NULL
 };
 
@@ -366,10 +385,6 @@ static void kgsl_page_alloc_free(struct kgsl_memdesc *memdesc)
 	struct scatterlist *sg;
 	int sglen = memdesc->sglen;
 
-	/* Don't free the guard page if it was used */
-	if (memdesc->priv & KGSL_MEMDESC_GUARD_PAGE)
-		sglen--;
-
 	kgsl_driver.stats.page_alloc -= memdesc->size;
 
 	if (memdesc->hostptr) {
@@ -407,10 +422,6 @@ static int kgsl_page_alloc_map_kernel(struct kgsl_memdesc *memdesc)
 		int sglen = memdesc->sglen;
 		int i, count = 0;
 
-		/* Don't map the guard page if it exists */
-		if (memdesc->priv & KGSL_MEMDESC_GUARD_PAGE)
-			sglen--;
-
 		/* create a list of pages to call vmap */
 		pages = vmalloc(npages * sizeof(struct page *));
 		if (!pages) {
@@ -568,14 +579,6 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 
 	sglen_alloc = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-	/*
-	 * Add guard page to the end of the allocation when the
-	 * IOMMU is in use.
-	 */
-
-	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU)
-		sglen_alloc++;
-
 	memdesc->size = size;
 	memdesc->pagetable = pagetable;
 	memdesc->ops = &kgsl_page_alloc_ops;
@@ -648,26 +651,6 @@ _kgsl_sharedmem_page_alloc(struct kgsl_memdesc *memdesc,
 		len -= page_size;
 	}
 
-	/* Add the guard page to the end of the sglist */
-
-	if (kgsl_mmu_get_mmutype() == KGSL_MMU_TYPE_IOMMU) {
-		/*
-		 * It doesn't matter if we use GFP_ZERO here, this never
-		 * gets mapped, and we only allocate it once in the life
-		 * of the system
-		 */
-
-		if (kgsl_guard_page == NULL)
-			kgsl_guard_page = alloc_page(GFP_KERNEL | __GFP_ZERO |
-				__GFP_HIGHMEM);
-
-		if (kgsl_guard_page != NULL) {
-			sg_set_page(&memdesc->sg[sglen++], kgsl_guard_page,
-				PAGE_SIZE, 0);
-			memdesc->priv |= KGSL_MEMDESC_GUARD_PAGE;
-		}
-	}
-
 	memdesc->sglen = sglen;
 
 	/*
diff --git a/drivers/gpu/msm/kgsl_sharedmem.h b/drivers/gpu/msm/kgsl_sharedmem.h
index e31b8f3d88eac2e038c2603cf57d22c0e025c7da..c000cbb6df63552f1672478d332a43b3918dcbbb 100644
--- a/drivers/gpu/msm/kgsl_sharedmem.h
+++ b/drivers/gpu/msm/kgsl_sharedmem.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -19,6 +19,7 @@
 #include "kgsl_mmu.h"
 #include <linux/slab.h>
 #include <linux/kmemleak.h>
+#include <linux/iommu.h>
 
 #include "kgsl_log.h"
 
@@ -200,15 +201,24 @@ kgsl_memdesc_has_guard_page(const struct kgsl_memdesc *memdesc)
 /*
  * kgsl_memdesc_protflags - get mmu protection flags
  * @memdesc - the memdesc
- * Returns a mask of GSL_PT_PAGE* values based on the
- * memdesc flags.
+ * Returns a mask of GSL_PT_PAGE* or IOMMU* values based
+ * on the memdesc flags.
  */
 static inline unsigned int
 kgsl_memdesc_protflags(const struct kgsl_memdesc *memdesc)
 {
-	unsigned int protflags = GSL_PT_PAGE_RV;
-	if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
-		protflags |= GSL_PT_PAGE_WV;
+	unsigned int protflags = 0;
+	enum kgsl_mmutype mmutype = kgsl_mmu_get_mmutype();
+
+	if (mmutype == KGSL_MMU_TYPE_GPU) {
+		protflags = GSL_PT_PAGE_RV;
+		if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
+			protflags |= GSL_PT_PAGE_WV;
+	} else if (mmutype == KGSL_MMU_TYPE_IOMMU) {
+		protflags = IOMMU_READ;
+		if (!(memdesc->flags & KGSL_MEMFLAGS_GPUREADONLY))
+			protflags |= IOMMU_WRITE;
+	}
 	return protflags;
 }
 
@@ -253,8 +263,7 @@ kgsl_allocate(struct kgsl_memdesc *memdesc,
 	ret = kgsl_sharedmem_page_alloc(memdesc, pagetable, size);
 	if (ret)
 		return ret;
-	ret = kgsl_mmu_map(pagetable, memdesc,
-			   kgsl_memdesc_protflags(memdesc));
+	ret = kgsl_mmu_map(pagetable, memdesc);
 	if (ret)
 		kgsl_sharedmem_free(memdesc);
 	return ret;
@@ -291,15 +300,4 @@ kgsl_allocate_contiguous(struct kgsl_memdesc *memdesc, size_t size)
 	return ret;
 }
 
-static inline int kgsl_sg_size(struct scatterlist *sg, int sglen)
-{
-	int i, size = 0;
-	struct scatterlist *s;
-
-	for_each_sg(sg, s, sglen, i) {
-		size += s->length;
-	}
-
-	return size;
-}
 #endif /* __KGSL_SHAREDMEM_H */
diff --git a/drivers/gpu/msm/kgsl_snapshot.c b/drivers/gpu/msm/kgsl_snapshot.c
index e20029453c45346da9067acc82d5eda363813031..50ac9c1bd3417fe32c4bb776d0dee30028a96176 100644
--- a/drivers/gpu/msm/kgsl_snapshot.c
+++ b/drivers/gpu/msm/kgsl_snapshot.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -106,7 +106,12 @@ static int snapshot_context_info(int id, void *ptr, void *data)
 {
 	struct kgsl_snapshot_linux_context *header = _ctxtptr;
 	struct kgsl_context *context = ptr;
-	struct kgsl_device *device = context->dev_priv->device;
+	struct kgsl_device *device;
+
+	if (context)
+		device = context->device;
+	else
+		device = (struct kgsl_device *)data;
 
 	header->id = id;
 
@@ -139,9 +144,12 @@ static int snapshot_os(struct kgsl_device *device,
 	/* Figure out how many active contexts there are - these will
 	 * be appended on the end of the structure */
 
-	rcu_read_lock();
+	read_lock(&device->context_lock);
 	idr_for_each(&device->context_idr, snapshot_context_count, &ctxtcount);
-	rcu_read_unlock();
+	read_unlock(&device->context_lock);
+
+	/* Increment ctxtcount for the global memstore */
+	ctxtcount++;
 
 	size += ctxtcount * sizeof(struct kgsl_snapshot_linux_context);
 
@@ -171,8 +179,9 @@ static int snapshot_os(struct kgsl_device *device,
 	header->grpclk = kgsl_get_clkrate(pwr->grp_clks[0]);
 	header->busclk = kgsl_get_clkrate(pwr->ebi1_clk);
 
-	/* Future proof for per-context timestamps */
-	header->current_context = -1;
+	/* Save the last active context */
+	kgsl_sharedmem_readl(&device->memstore, &header->current_context,
+		KGSL_MEMSTORE_OFFSET(KGSL_MEMSTORE_GLOBAL, current_context));
 
 	/* Get the current PT base */
 	header->ptbase = kgsl_mmu_get_current_ptbase(&device->mmu);
@@ -187,11 +196,17 @@ static int snapshot_os(struct kgsl_device *device,
 
 	header->ctxtcount = ctxtcount;
 
-	/* append information for each context */
 	_ctxtptr = snapshot + sizeof(*header);
-	rcu_read_lock();
+
+	/* append information for the global context */
+	snapshot_context_info(KGSL_MEMSTORE_GLOBAL, NULL, device);
+
+	/* append information for each context */
+
+	read_lock(&device->context_lock);
 	idr_for_each(&device->context_idr, snapshot_context_info, NULL);
-	rcu_read_unlock();
+	read_unlock(&device->context_lock);
+
 	/* Return the size of the data segment */
 	return size;
 }
@@ -286,7 +301,7 @@ static void kgsl_snapshot_put_object(struct kgsl_device *device,
 {
 	list_del(&obj->node);
 
-	obj->entry->flags &= ~KGSL_MEM_ENTRY_FROZEN;
+	obj->entry->memdesc.priv &= ~KGSL_MEMDESC_FROZEN;
 	kgsl_mem_entry_put(obj->entry);
 
 	kfree(obj);
@@ -317,6 +332,7 @@ int kgsl_snapshot_have_object(struct kgsl_device *device, unsigned int ptbase,
 
 	return 0;
 }
+EXPORT_SYMBOL(kgsl_snapshot_have_object);
 
 /* kgsl_snapshot_get_object - Mark a GPU buffer to be frozen
  * @device - the device that is being snapshotted
@@ -336,6 +352,10 @@ int kgsl_snapshot_get_object(struct kgsl_device *device, unsigned int ptbase,
 	struct kgsl_mem_entry *entry;
 	struct kgsl_snapshot_object *obj;
 	int offset;
+	int ret = -EINVAL;
+
+	if (!gpuaddr)
+		return 0;
 
 	entry = kgsl_get_mem_entry(device, ptbase, gpuaddr, size);
 
@@ -349,7 +369,7 @@ int kgsl_snapshot_get_object(struct kgsl_device *device, unsigned int ptbase,
 	if (entry->memtype != KGSL_MEM_ENTRY_KERNEL) {
 		KGSL_DRV_ERR(device,
 			"Only internal GPU buffers can be frozen\n");
-		return -EINVAL;
+		goto err_put;
 	}
 
 	/*
@@ -372,36 +392,33 @@ int kgsl_snapshot_get_object(struct kgsl_device *device, unsigned int ptbase,
 	if (size + offset > entry->memdesc.size) {
 		KGSL_DRV_ERR(device, "Invalid size for GPU buffer %8.8X\n",
 				gpuaddr);
-		return -EINVAL;
+		goto err_put;
 	}
 
 	/* If the buffer is already on the list, skip it */
 	list_for_each_entry(obj, &device->snapshot_obj_list, node) {
 		if (obj->gpuaddr == gpuaddr && obj->ptbase == ptbase) {
-			/* If the size is different, use the new size */
-			if (obj->size != size)
+			/* If the size is different, use the bigger size */
+			if (obj->size < size)
 				obj->size = size;
-
-			return 0;
+			ret = 0;
+			goto err_put;
 		}
 	}
 
 	if (kgsl_memdesc_map(&entry->memdesc) == NULL) {
 		KGSL_DRV_ERR(device, "Unable to map GPU buffer %X\n",
 				gpuaddr);
-		return -EINVAL;
+		goto err_put;
 	}
 
 	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
 
 	if (obj == NULL) {
 		KGSL_DRV_ERR(device, "Unable to allocate memory\n");
-		return -EINVAL;
+		goto err_put;
 	}
 
-	/* Ref count the mem entry */
-	kgsl_mem_entry_get(entry);
-
 	obj->type = type;
 	obj->entry = entry;
 	obj->gpuaddr = gpuaddr;
@@ -419,12 +436,15 @@ int kgsl_snapshot_get_object(struct kgsl_device *device, unsigned int ptbase,
 	 * 0 so it doesn't get counted twice
 	 */
 
-	if (entry->flags & KGSL_MEM_ENTRY_FROZEN)
-		return 0;
+	ret = (entry->memdesc.priv & KGSL_MEMDESC_FROZEN) ? 0
+		: entry->memdesc.size;
 
-	entry->flags |= KGSL_MEM_ENTRY_FROZEN;
+	entry->memdesc.priv |= KGSL_MEMDESC_FROZEN;
 
-	return entry->memdesc.size;
+	return ret;
+err_put:
+	kgsl_mem_entry_put(entry);
+	return ret;
 }
 EXPORT_SYMBOL(kgsl_snapshot_get_object);
 
diff --git a/drivers/gpu/msm/kgsl_sync.c b/drivers/gpu/msm/kgsl_sync.c
index 8ee076d8e47fb462d8698a0e6d06f0d39bf09729..b74d4604d14ac22cff946d8205776ff1fda837ef 100644
--- a/drivers/gpu/msm/kgsl_sync.c
+++ b/drivers/gpu/msm/kgsl_sync.c
@@ -11,6 +11,7 @@
  *
  */
 
+#include <linux/err.h>
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
@@ -225,3 +226,65 @@ void kgsl_sync_timeline_destroy(struct kgsl_context *context)
 {
 	sync_timeline_destroy(context->timeline);
 }
+
+static void kgsl_sync_callback(struct sync_fence *fence,
+	struct sync_fence_waiter *waiter)
+{
+	struct kgsl_sync_fence_waiter *kwaiter =
+		(struct kgsl_sync_fence_waiter *) waiter;
+	kwaiter->func(kwaiter->priv);
+	sync_fence_put(kwaiter->fence);
+	kfree(kwaiter);
+}
+
+struct kgsl_sync_fence_waiter *kgsl_sync_fence_async_wait(int fd,
+	void (*func)(void *priv), void *priv)
+{
+	struct kgsl_sync_fence_waiter *kwaiter;
+	struct sync_fence *fence;
+	int status;
+
+	fence = sync_fence_fdget(fd);
+	if (fence == NULL)
+		return ERR_PTR(-EINVAL);
+
+	/* create the waiter */
+	kwaiter = kzalloc(sizeof(*kwaiter), GFP_KERNEL);
+	if (kwaiter == NULL) {
+		sync_fence_put(fence);
+		return ERR_PTR(-ENOMEM);
+	}
+	kwaiter->fence = fence;
+	kwaiter->priv = priv;
+	kwaiter->func = func;
+	sync_fence_waiter_init((struct sync_fence_waiter *) kwaiter,
+		kgsl_sync_callback);
+
+	/* a nonzero status means the fence errored or was already signaled */
+	status = sync_fence_wait_async(fence,
+		(struct sync_fence_waiter *) kwaiter);
+	if (status) {
+		kfree(kwaiter);
+		sync_fence_put(fence);
+		if (status < 0)
+			kwaiter = ERR_PTR(status);
+		else
+			kwaiter = NULL;
+	}
+
+	return kwaiter;
+}
+
+int kgsl_sync_fence_async_cancel(struct kgsl_sync_fence_waiter *kwaiter)
+{
+	if (kwaiter == NULL)
+		return 0;
+
+	if (sync_fence_cancel_async(kwaiter->fence,
+		(struct sync_fence_waiter *) kwaiter) == 0) {
+		sync_fence_put(kwaiter->fence);
+		kfree(kwaiter);
+		return 1;
+	}
+	return 0;
+}
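
The return convention of kgsl_sync_fence_async_wait() above is three-way: an ERR_PTR() on failure, NULL when the fence had already signaled (so the callback will never run), and a live waiter when the callback is armed. A sketch of a caller handling all three; the callback, the payload, and the -EINPROGRESS convention are illustrative only.

/* Hypothetical consumer of the async fence wait API above */
static void example_cb(void *priv)
{
	/* runs once when the fence signals; the waiter frees itself */
}

static int example_wait_on_fence(int fd, void *payload)
{
	struct kgsl_sync_fence_waiter *waiter;

	waiter = kgsl_sync_fence_async_wait(fd, example_cb, payload);

	if (IS_ERR(waiter))
		return PTR_ERR(waiter);	/* bad fd or allocation failure */

	if (waiter == NULL)
		return 0;		/* fence was already signaled */

	/* keep 'waiter' if kgsl_sync_fence_async_cancel() may be needed */
	return -EINPROGRESS;
}
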
diff --git a/drivers/gpu/msm/kgsl_sync.h b/drivers/gpu/msm/kgsl_sync.h
index 06b3ad0d89188316ddb2d61ed99381f63b5cf171..2f28b21fc6dd84fb48736ed0a24a773e4dc56020 100644
--- a/drivers/gpu/msm/kgsl_sync.h
+++ b/drivers/gpu/msm/kgsl_sync.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -26,6 +26,13 @@ struct kgsl_sync_pt {
 	unsigned int timestamp;
 };
 
+struct kgsl_sync_fence_waiter {
+	struct sync_fence_waiter waiter;
+	struct sync_fence *fence;
+	void (*func)(void *priv);
+	void *priv;
+};
+
 #if defined(CONFIG_SYNC)
 struct sync_pt *kgsl_sync_pt_create(struct sync_timeline *timeline,
 	unsigned int timestamp);
@@ -37,6 +44,9 @@ int kgsl_sync_timeline_create(struct kgsl_context *context);
 void kgsl_sync_timeline_signal(struct sync_timeline *timeline,
 	unsigned int timestamp);
 void kgsl_sync_timeline_destroy(struct kgsl_context *context);
+struct kgsl_sync_fence_waiter *kgsl_sync_fence_async_wait(int fd,
+	void (*func)(void *priv), void *priv);
+int kgsl_sync_fence_async_cancel(struct kgsl_sync_fence_waiter *waiter);
 #else
 static inline struct sync_pt
 *kgsl_sync_pt_create(struct sync_timeline *timeline, unsigned int timestamp)
@@ -70,6 +80,20 @@ kgsl_sync_timeline_signal(struct sync_timeline *timeline,
 static inline void kgsl_sync_timeline_destroy(struct kgsl_context *context)
 {
 }
+
+static inline struct kgsl_sync_fence_waiter *
+kgsl_sync_fence_async_wait(int fd, void (*func)(void *priv), void *priv)
+{
+	return NULL;
+}
+
+static inline int
+kgsl_sync_fence_async_cancel(struct kgsl_sync_fence_waiter *waiter)
+{
+	return 1;
+}
+
 #endif
 
 #endif /* __KGSL_SYNC_H */
diff --git a/drivers/gpu/msm/kgsl_trace.h b/drivers/gpu/msm/kgsl_trace.h
index 8c62739eed995ffb1d64f81155ce5fb7ee92581a..b55075935db729f44f1a2db669bcbe5265b9d132 100644
--- a/drivers/gpu/msm/kgsl_trace.h
+++ b/drivers/gpu/msm/kgsl_trace.h
@@ -37,14 +37,13 @@ TRACE_EVENT(kgsl_issueibcmds,
 
 	TP_PROTO(struct kgsl_device *device,
 			int drawctxt_id,
-			struct kgsl_ibdesc *ibdesc,
-			int numibs,
+			struct kgsl_cmdbatch *cmdbatch,
 			int timestamp,
 			int flags,
 			int result,
 			unsigned int type),
 
-	TP_ARGS(device, drawctxt_id, ibdesc, numibs, timestamp, flags,
+	TP_ARGS(device, drawctxt_id, cmdbatch, timestamp, flags,
 		result, type),
 
 	TP_STRUCT__entry(
@@ -61,8 +60,8 @@ TRACE_EVENT(kgsl_issueibcmds,
 	TP_fast_assign(
 		__assign_str(device_name, device->name);
 		__entry->drawctxt_id = drawctxt_id;
-		__entry->ibdesc_addr = ibdesc[0].gpuaddr;
-		__entry->numibs = numibs;
+		__entry->ibdesc_addr = cmdbatch->ibdesc[0].gpuaddr;
+		__entry->numibs = cmdbatch->ibcount;
 		__entry->timestamp = timestamp;
 		__entry->flags = flags;
 		__entry->result = result;
@@ -479,6 +478,67 @@ TRACE_EVENT(kgsl_mem_free,
 	)
 );
 
+TRACE_EVENT(kgsl_mem_sync_cache,
+
+	TP_PROTO(struct kgsl_mem_entry *mem_entry, unsigned int op),
+
+	TP_ARGS(mem_entry, op),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, gpuaddr)
+		__field(unsigned int, size)
+		__array(char, usage, 16)
+		__field(unsigned int, tgid)
+		__field(unsigned int, id)
+		__field(unsigned int, op)
+	),
+
+	TP_fast_assign(
+		__entry->gpuaddr = mem_entry->memdesc.gpuaddr;
+		__entry->size = mem_entry->memdesc.size;
+		__entry->tgid = mem_entry->priv->pid;
+		__entry->id = mem_entry->id;
+		kgsl_get_memory_usage(__entry->usage, sizeof(__entry->usage),
+				     mem_entry->memdesc.flags);
+		__entry->op = op;
+	),
+
+	TP_printk(
+		"gpuaddr=0x%08x size=%d tgid=%d usage=%s id=%d op=%c%c",
+		__entry->gpuaddr, __entry->size, __entry->tgid, __entry->usage,
+		__entry->id,
+		(__entry->op & KGSL_GPUMEM_CACHE_CLEAN) ? 'c' : '.',
+		(__entry->op & KGSL_GPUMEM_CACHE_INV) ? 'i' : '.'
+	)
+);
+
+TRACE_EVENT(kgsl_mem_sync_full_cache,
+
+	TP_PROTO(unsigned int num_bufs, unsigned int bulk_size,
+		unsigned int op),
+
+	TP_ARGS(num_bufs, bulk_size, op),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, num_bufs)
+		__field(unsigned int, bulk_size)
+		__field(unsigned int, op)
+	),
+
+	TP_fast_assign(
+		__entry->num_bufs = num_bufs;
+		__entry->bulk_size = bulk_size;
+		__entry->op = op;
+	),
+
+	TP_printk(
+		"num_bufs=%d bulk_size=%d op=%c%c",
+		__entry->num_bufs, __entry->bulk_size,
+		(__entry->op & KGSL_GPUMEM_CACHE_CLEAN) ? 'c' : '.',
+		(__entry->op & KGSL_GPUMEM_CACHE_INV) ? 'i' : '.'
+	)
+);
+
 DECLARE_EVENT_CLASS(kgsl_mem_timestamp_template,
 
 	TP_PROTO(struct kgsl_device *device, struct kgsl_mem_entry *mem_entry,
@@ -591,6 +651,28 @@ TRACE_EVENT(kgsl_context_detach,
 	)
 );
 
+TRACE_EVENT(kgsl_context_destroy,
+
+	TP_PROTO(struct kgsl_device *device, struct kgsl_context *context),
+
+	TP_ARGS(device, context),
+
+	TP_STRUCT__entry(
+		__string(device_name, device->name)
+		__field(unsigned int, id)
+	),
+
+	TP_fast_assign(
+		__assign_str(device_name, device->name);
+		__entry->id = context->id;
+	),
+
+	TP_printk(
+		"d_name=%s ctx=%u",
+		__get_str(device_name), __entry->id
+	)
+);
+
 TRACE_EVENT(kgsl_mmu_pagefault,
 
 	TP_PROTO(struct kgsl_device *device, unsigned int page,
@@ -681,6 +763,30 @@ TRACE_EVENT(kgsl_fire_event,
 			__entry->id, __entry->ts, __entry->type, __entry->age)
 );
 
+TRACE_EVENT(kgsl_active_count,
+
+	TP_PROTO(struct kgsl_device *device, unsigned long ip),
+
+	TP_ARGS(device, ip),
+
+	TP_STRUCT__entry(
+		__string(device_name, device->name)
+		__field(unsigned int, count)
+		__field(unsigned long, ip)
+	),
+
+	TP_fast_assign(
+		__assign_str(device_name, device->name);
+		__entry->count = atomic_read(&device->active_cnt);
+		__entry->ip = ip;
+	),
+
+	TP_printk(
+		"d_name=%s active_cnt=%x func=%pf",
+		__get_str(device_name), __entry->count, (void *) __entry->ip
+	)
+);
+
 #endif /* _KGSL_TRACE_H */
 
 /* This part must be outside protection */
diff --git a/drivers/gpu/msm/z180.c b/drivers/gpu/msm/z180.c
index c62f67b4a5c68fad7af6e29969e66091b8ccae88..9cebacec0484c3a2c5b2a0150af3ab8fce92c972 100644
--- a/drivers/gpu/msm/z180.c
+++ b/drivers/gpu/msm/z180.c
@@ -17,7 +17,6 @@
 #include "kgsl.h"
 #include "kgsl_cffdump.h"
 #include "kgsl_sharedmem.h"
-#include "kgsl_trace.h"
 
 #include "z180.h"
 #include "z180_reg.h"
@@ -94,7 +93,8 @@ enum z180_cmdwindow_type {
 #define Z180_CMDWINDOW_TARGET_SHIFT		0
 #define Z180_CMDWINDOW_ADDR_SHIFT		8
 
-static int z180_start(struct kgsl_device *device, unsigned int init_ram);
+static int z180_init(struct kgsl_device *device);
+static int z180_start(struct kgsl_device *device);
 static int z180_stop(struct kgsl_device *device);
 static int z180_wait(struct kgsl_device *device,
 				struct kgsl_context *context,
@@ -245,20 +245,17 @@ static int z180_setup_pt(struct kgsl_device *device,
 	int result = 0;
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 
-	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->mmu.setstate_memory);
 
 	if (result)
 		goto error;
 
-	result = kgsl_mmu_map_global(pagetable, &device->memstore,
-				     GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+	result = kgsl_mmu_map_global(pagetable, &device->memstore);
 	if (result)
 		goto error_unmap_dummy;
 
 	result = kgsl_mmu_map_global(pagetable,
-				     &z180_dev->ringbuffer.cmdbufdesc,
-				     GSL_PT_PAGE_RV);
+				     &z180_dev->ringbuffer.cmdbufdesc);
 	if (result)
 		goto error_unmap_memstore;
 	/*
@@ -323,16 +320,11 @@ static void addcmd(struct z180_ringbuffer *rb, unsigned int timestamp,
 	*p++ = ADDR_VGV3_LAST << 24;
 }
 
-static void z180_cmdstream_start(struct kgsl_device *device, int init_ram)
+static void z180_cmdstream_start(struct kgsl_device *device)
 {
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 	unsigned int cmd = VGV3_NEXTCMD_JUMP << VGV3_NEXTCMD_NEXTCMD_FSHIFT;
 
-	if (init_ram) {
-		z180_dev->timestamp = 0;
-		z180_dev->current_timestamp = 0;
-	}
-
 	addmarker(&z180_dev->ringbuffer, 0);
 
 	z180_cmdwindow_write(device, ADDR_VGV3_MODE, 4);
@@ -362,7 +354,13 @@ static int room_in_rb(struct z180_device *device)
 	return ts_diff < Z180_PACKET_COUNT;
 }
 
-static int z180_idle(struct kgsl_device *device)
+/**
+ * z180_idle() - Idle the 2D device
+ * @device: Pointer to the KGSL device struct for the Z180
+ *
+ * Wait until the Z180 submission queue is idle.
+ */
+int z180_idle(struct kgsl_device *device)
 {
 	int status = 0;
 	struct z180_device *z180_dev = Z180_DEVICE(device);
@@ -382,10 +380,8 @@ static int z180_idle(struct kgsl_device *device)
 int
 z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv,
 			struct kgsl_context *context,
-			struct kgsl_ibdesc *ibdesc,
-			unsigned int numibs,
-			uint32_t *timestamp,
-			unsigned int ctrl)
+			struct kgsl_cmdbatch *cmdbatch,
+			uint32_t *timestamp)
 {
 	long result = 0;
 	unsigned int ofs        = PACKETSIZE_STATESTREAM * sizeof(unsigned int);
@@ -398,6 +394,20 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv,
 	struct kgsl_pagetable *pagetable = dev_priv->process_priv->pagetable;
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 	unsigned int sizedwords;
+	unsigned int numibs;
+	struct kgsl_ibdesc *ibdesc;
+
+	mutex_lock(&device->mutex);
+
+	kgsl_active_count_get(device);
+
+	if (cmdbatch == NULL) {
+		result = -EINVAL;
+		goto error;
+	}
+
+	ibdesc = cmdbatch->ibdesc;
+	numibs = cmdbatch->ibcount;
 
 	if (device->state & KGSL_STATE_HUNG) {
 		result = -EINVAL;
@@ -439,7 +449,7 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv,
 		context->id, cmd, sizedwords);
 	/* context switch */
 	if ((context->id != (int)z180_dev->ringbuffer.prevctx) ||
-	    (ctrl & KGSL_CONTEXT_CTX_SWITCH)) {
+	    (cmdbatch->flags & KGSL_CONTEXT_CTX_SWITCH)) {
 		KGSL_CMD_INFO(device, "context switch %d -> %d\n",
 			context->id, z180_dev->ringbuffer.prevctx);
 		kgsl_mmu_setstate(&device->mmu, pagetable,
@@ -447,10 +457,13 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv,
 		cnt = PACKETSIZE_STATESTREAM;
 		ofs = 0;
 	}
-	kgsl_setstate(&device->mmu,
+
+	result = kgsl_setstate(&device->mmu,
 			KGSL_MEMSTORE_GLOBAL,
 			kgsl_mmu_pt_get_flags(device->mmu.hwpagetable,
 			device->id));
+	if (result < 0)
+		goto error;
 
 	result = wait_event_interruptible_timeout(device->wait_queue,
 				  room_in_rb(z180_dev),
@@ -491,9 +504,12 @@ z180_cmdstream_issueibcmds(struct kgsl_device_private *dev_priv,
 	z180_cmdwindow_write(device, ADDR_VGV3_CONTROL, cmd);
 	z180_cmdwindow_write(device, ADDR_VGV3_CONTROL, 0);
 error:
+	if (cmdbatch != NULL)
+		kgsl_trace_issueibcmds(device, context->id, cmdbatch,
+			*timestamp, cmdbatch->flags, result, 0);
 
-	trace_kgsl_issueibcmds(device, context->id, ibdesc, numibs,
-		*timestamp, ctrl, result, 0);
+	kgsl_active_count_put(device);
+
+	mutex_unlock(&device->mutex);
 
 	return (int)result;
 }
@@ -503,6 +519,7 @@ static int z180_ringbuffer_init(struct kgsl_device *device)
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 	memset(&z180_dev->ringbuffer, 0, sizeof(struct z180_ringbuffer));
 	z180_dev->ringbuffer.prevctx = Z180_INVALID_CONTEXT;
+	z180_dev->ringbuffer.cmdbufdesc.flags = KGSL_MEMFLAGS_GPUREADONLY;
 	return kgsl_allocate_contiguous(&z180_dev->ringbuffer.cmdbufdesc,
 		Z180_RB_SIZE);
 }
@@ -559,7 +576,17 @@ static int __devexit z180_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static int z180_start(struct kgsl_device *device, unsigned int init_ram)
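+/* z180_init() - early init: reset the device timestamps */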
+static int z180_init(struct kgsl_device *device)
+{
+	struct z180_device *z180_dev = Z180_DEVICE(device);
+
+	z180_dev->timestamp = 0;
+	z180_dev->current_timestamp = 0;
+
+	return 0;
+}
+
+static int z180_start(struct kgsl_device *device)
 {
 	int status = 0;
 
@@ -576,7 +603,7 @@ static int z180_start(struct kgsl_device *device, unsigned int init_ram)
 	if (status)
 		goto error_clk_off;
 
-	z180_cmdstream_start(device, init_ram);
+	z180_cmdstream_start(device);
 
 	mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
 	kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_ON);
@@ -661,7 +688,7 @@ static int z180_getproperty(struct kgsl_device *device,
 	return status;
 }
 
-static unsigned int z180_isidle(struct kgsl_device *device)
+static bool z180_isidle(struct kgsl_device *device)
 {
 	struct z180_device *z180_dev = Z180_DEVICE(device);
 
@@ -822,9 +849,9 @@ static int z180_waittimestamp(struct kgsl_device *device,
 {
 	int status = -EINVAL;
 
-	/* Don't wait forever, set a max (10 sec) value for now */
+	/* Don't wait forever, set a max of Z180_IDLE_TIMEOUT */
 	if (msecs == -1)
-		msecs = 10 * MSEC_PER_SEC;
+		msecs = Z180_IDLE_TIMEOUT;
 
 	mutex_unlock(&device->mutex);
 	status = z180_wait(device, context, timestamp, msecs);
@@ -858,11 +885,30 @@ static int z180_wait(struct kgsl_device *device,
 	return status;
 }
 
-static void
-z180_drawctxt_destroy(struct kgsl_device *device,
-			  struct kgsl_context *context)
+struct kgsl_context *
+z180_drawctxt_create(struct kgsl_device_private *dev_priv,
+			uint32_t *flags)
 {
-	struct z180_device *z180_dev = Z180_DEVICE(device);
+	int ret;
+	struct kgsl_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (context == NULL)
+		return ERR_PTR(-ENOMEM);
+	ret = kgsl_context_init(dev_priv, context);
+	if (ret != 0) {
+		kfree(context);
+		return ERR_PTR(ret);
+	}
+	return context;
+}
+
+static int
+z180_drawctxt_detach(struct kgsl_context *context)
+{
+	struct kgsl_device *device;
+	struct z180_device *z180_dev;
+
+	device = context->device;
+	z180_dev = Z180_DEVICE(device);
 
 	z180_idle(device);
 
@@ -872,6 +918,14 @@ z180_drawctxt_destroy(struct kgsl_device *device,
 		kgsl_setstate(&device->mmu, KGSL_MEMSTORE_GLOBAL,
 				KGSL_MMUFLAGS_PTUPDATE);
 	}
+
+	return 0;
+}
+
+static void
+z180_drawctxt_destroy(struct kgsl_context *context)
+{
+	kfree(context);
 }
 
 static void z180_power_stats(struct kgsl_device *device,
@@ -926,6 +980,7 @@ static const struct kgsl_functable z180_functable = {
 	.idle = z180_idle,
 	.isidle = z180_isidle,
 	.suspend_context = z180_suspend_context,
+	.init = z180_init,
 	.start = z180_start,
 	.stop = z180_stop,
 	.getproperty = z180_getproperty,
@@ -938,8 +993,10 @@ static const struct kgsl_functable z180_functable = {
 	.irqctrl = z180_irqctrl,
 	.gpuid = z180_gpuid,
 	.irq_handler = z180_irq_handler,
+	.drain = z180_idle, /* drain == idle for the z180 */
 	/* Optional functions */
-	.drawctxt_create = NULL,
+	.drawctxt_create = z180_drawctxt_create,
+	.drawctxt_detach = z180_drawctxt_detach,
 	.drawctxt_destroy = z180_drawctxt_destroy,
 	.ioctl = NULL,
 	.postmortem_dump = z180_dump,
diff --git a/drivers/gpu/msm/z180.h b/drivers/gpu/msm/z180.h
index 268aac3efe686b2d1ba12a2740e0dd7240608df1..a36e92d864fd47c2e93b3a4c7203e72b346d63a1 100644
--- a/drivers/gpu/msm/z180.h
+++ b/drivers/gpu/msm/z180.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2008-2012, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2008-2013, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -29,7 +29,7 @@
 #define Z180_DEFAULT_PWRSCALE_POLICY  NULL
 
-/* Wait a maximum of 10 seconds when trying to idle the core */
-#define Z180_IDLE_TIMEOUT (10 * 1000)
+/* Wait a maximum of 20 seconds when trying to idle the core */
+#define Z180_IDLE_TIMEOUT (20 * 1000)
 
 struct z180_ringbuffer {
 	unsigned int prevctx;
@@ -45,5 +45,6 @@ struct z180_device {
 };
 
 int z180_dump(struct kgsl_device *, int);
+int z180_idle(struct kgsl_device *);
 
 #endif /* __Z180_H */
diff --git a/drivers/gpu/msm/z180_postmortem.c b/drivers/gpu/msm/z180_postmortem.c
index c1e5f07cf0897b0b70badf5483eaae4cadf24842..03ebdb572d8bcd13421808cdfefe68f486039c57 100644
--- a/drivers/gpu/msm/z180_postmortem.c
+++ b/drivers/gpu/msm/z180_postmortem.c
@@ -58,6 +58,8 @@ static void z180_dump_regs(struct kgsl_device *device)
 	unsigned int i;
 	unsigned int reg_val;
 
+	z180_idle(device);
+
 	KGSL_LOG_DUMP(device, "Z180 Register Dump\n");
 	for (i = 0; i < ARRAY_SIZE(regs_to_dump); i++) {
 		kgsl_regread(device,
@@ -168,6 +170,7 @@ static void z180_dump_ib(struct kgsl_device *device)
 				KGSL_LOG_DUMP(device,
 				"Could not map IB to kernel memory, Ringbuffer Slot: %d\n",
 				rb_slot_num);
+				kgsl_mem_entry_put(entry);
 				continue;
 			}
 
@@ -190,6 +193,7 @@ static void z180_dump_ib(struct kgsl_device *device)
 						linebuf);
 			}
 			KGSL_LOG_DUMP(device, "IB Dump Finished\n");
+			kgsl_mem_entry_put(entry);
 		}
 	}
 }
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 6200095bc5c404ef2220e02aa9376630241cdbeb..66bfac2458eb298a20d2ac1432f851b516be51bd 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -1,5 +1,4 @@
-/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
- *
+/* Copyright (c) 2010-2012, The Linux Foundation. All rights reserved.
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
  * only version 2 as published by the Free Software Foundation.
@@ -50,6 +49,9 @@ __asm__ __volatile__ (							\
 #define MSM_IOMMU_ATTR_CACHED_WT	0x3
 
 
+static int msm_iommu_unmap_range(struct iommu_domain *domain, unsigned int va,
+				 unsigned int len);
+
 static inline void clean_pte(unsigned long *start, unsigned long *end,
 			     int redirect)
 {
@@ -907,6 +909,7 @@ static int msm_iommu_map_range(struct iommu_domain *domain, unsigned int va,
 			       int prot)
 {
 	unsigned int pa;
+	unsigned int start_va = va;
 	unsigned int offset = 0;
 	unsigned long *fl_table;
 	unsigned long *fl_pte;
@@ -978,12 +981,6 @@ static int msm_iommu_map_range(struct iommu_domain *domain, unsigned int va,
 				chunk_offset = 0;
 				sg = sg_next(sg);
 				pa = get_phys_addr(sg);
-				if (pa == 0) {
-					pr_debug("No dma address for sg %p\n",
-							sg);
-					ret = -EINVAL;
-					goto fail;
-				}
 			}
 			continue;
 		}
@@ -1037,12 +1034,6 @@ static int msm_iommu_map_range(struct iommu_domain *domain, unsigned int va,
 				chunk_offset = 0;
 				sg = sg_next(sg);
 				pa = get_phys_addr(sg);
-				if (pa == 0) {
-					pr_debug("No dma address for sg %p\n",
-							sg);
-					ret = -EINVAL;
-					goto fail;
-				}
 			}
 		}
 
@@ -1055,6 +1046,8 @@ static int msm_iommu_map_range(struct iommu_domain *domain, unsigned int va,
 	__flush_iotlb(domain);
 fail:
 	mutex_unlock(&msm_iommu_lock);
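+	/* on failure, unwind whatever portion of the range was mapped */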
+	if (ret && offset > 0)
+		msm_iommu_unmap_range(domain, start_va, offset);
 	return ret;
 }
 
diff --git a/drivers/iommu/msm_iommu_pagetable.c b/drivers/iommu/msm_iommu_pagetable.c
index 2ee9ba6a27bdf6fbdccac31616cf343c2217c56b..34bbddc7caff64613dec8dbf2e8c741da3cfe81c 100644
--- a/drivers/iommu/msm_iommu_pagetable.c
+++ b/drivers/iommu/msm_iommu_pagetable.c
@@ -351,11 +351,6 @@ int msm_iommu_pagetable_map_range(struct iommu_pt *pt, unsigned int va,
 	sl_offset = SL_OFFSET(va);
 
 	chunk_pa = get_phys_addr(sg);
-	if (chunk_pa == 0) {
-		pr_debug("No dma address for sg %p\n", sg);
-		ret = -EINVAL;
-		goto fail;
-	}
 
 	while (offset < len) {
 		/* Set up a 2nd level page table if one doesn't exist */
@@ -399,12 +394,6 @@ int msm_iommu_pagetable_map_range(struct iommu_pt *pt, unsigned int va,
 				chunk_offset = 0;
 				sg = sg_next(sg);
 				chunk_pa = get_phys_addr(sg);
-				if (chunk_pa == 0) {
-					pr_debug("No dma address for sg %p\n",
-						sg);
-					ret = -EINVAL;
-					goto fail;
-				}
 			}
 		}
 
diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h
index 29a44de41c7473d29cafdad0a49d0d2a753f76dd..0af811c86ab8f1dc6adfc473b4e4416ae73c1cf8 100644
--- a/include/linux/msm_kgsl.h
+++ b/include/linux/msm_kgsl.h
@@ -12,25 +12,27 @@
 #define KGSL_VERSION_MINOR        14
 
 /*context flags */
-#define KGSL_CONTEXT_SAVE_GMEM		  0x00000001
-#define KGSL_CONTEXT_NO_GMEM_ALLOC	  0x00000002
-#define KGSL_CONTEXT_SUBMIT_IB_LIST	  0x00000004
-#define KGSL_CONTEXT_CTX_SWITCH		  0x00000008
-#define KGSL_CONTEXT_PREAMBLE		  0x00000010
-#define KGSL_CONTEXT_TRASH_STATE	  0x00000020
-#define KGSL_CONTEXT_PER_CONTEXT_TS	  0x00000040
-#define KGSL_CONTEXT_USER_GENERATED_TS	  0x00000080
-#define KGSL_CONTEXT_END_OF_FRAME         0x00000100
-#define KGSL_CONTEXT_NO_FAULT_TOLERANCE	  0x00000200
+#define KGSL_CONTEXT_SAVE_GMEM		0x00000001
+#define KGSL_CONTEXT_NO_GMEM_ALLOC	0x00000002
+#define KGSL_CONTEXT_SUBMIT_IB_LIST	0x00000004
+#define KGSL_CONTEXT_CTX_SWITCH		0x00000008
+#define KGSL_CONTEXT_PREAMBLE		0x00000010
+#define KGSL_CONTEXT_TRASH_STATE	0x00000020
+#define KGSL_CONTEXT_PER_CONTEXT_TS	0x00000040
+#define KGSL_CONTEXT_USER_GENERATED_TS	0x00000080
+#define KGSL_CONTEXT_END_OF_FRAME	0x00000100
+
+#define KGSL_CONTEXT_NO_FAULT_TOLERANCE 0x00000200
+#define KGSL_CONTEXT_SYNC               0x00000400
 /* bits [12:15] are reserved for future use */
-#define KGSL_CONTEXT_TYPE_MASK            0x01F00000
-#define KGSL_CONTEXT_TYPE_SHIFT           20
+#define KGSL_CONTEXT_TYPE_MASK          0x01F00000
+#define KGSL_CONTEXT_TYPE_SHIFT         20
 
-#define KGSL_CONTEXT_TYPE_ANY		  0
-#define KGSL_CONTEXT_TYPE_GL		  1
-#define KGSL_CONTEXT_TYPE_CL		  2
-#define KGSL_CONTEXT_TYPE_C2D		  3
-#define KGSL_CONTEXT_TYPE_RS		  4
+#define KGSL_CONTEXT_TYPE_ANY		0
+#define KGSL_CONTEXT_TYPE_GL		1
+#define KGSL_CONTEXT_TYPE_CL		2
+#define KGSL_CONTEXT_TYPE_C2D		3
+#define KGSL_CONTEXT_TYPE_RS		4
 
 #define KGSL_CONTEXT_INVALID 0xffffffff
 
@@ -194,31 +196,6 @@ enum kgsl_property_type {
 	KGSL_PROP_VERSION         = 0x00000008,
 	KGSL_PROP_GPU_RESET_STAT  = 0x00000009,
 	KGSL_PROP_PWRCTRL         = 0x0000000E,
-	KGSL_PROP_FAULT_TOLERANCE = 0x00000011,
-};
-
-/* Fault Tolerance policy flags */
-#define  KGSL_FT_DISABLE                  0x00000001
-#define  KGSL_FT_REPLAY                   0x00000002
-#define  KGSL_FT_SKIPIB                   0x00000004
-#define  KGSL_FT_SKIPFRAME                0x00000008
-#define  KGSL_FT_DEFAULT_POLICY           (KGSL_FT_REPLAY + KGSL_FT_SKIPIB)
-
-/* Pagefault policy flags */
-#define KGSL_FT_PAGEFAULT_INT_ENABLE         0x00000001
-#define KGSL_FT_PAGEFAULT_GPUHALT_ENABLE     0x00000002
-#define KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE   0x00000004
-#define KGSL_FT_PAGEFAULT_LOG_ONE_PER_INT    0x00000008
-#define KGSL_FT_PAGEFAULT_DEFAULT_POLICY     (KGSL_FT_PAGEFAULT_INT_ENABLE + \
-					KGSL_FT_PAGEFAULT_LOG_ONE_PER_PAGE)
-
-/* Fault tolerance config */
-struct kgsl_ft_config {
-	unsigned int ft_policy;    /* Fault Tolerance policy flags */
-	unsigned int ft_pf_policy; /* Pagefault policy flags */
-	unsigned int ft_pm_dump;   /* KGSL enable postmortem dump */
-	unsigned int ft_detect_ms;
-	unsigned int ft_dos_timeout_ms;
 };
 
 struct kgsl_shadowprop {
@@ -234,6 +211,26 @@ struct kgsl_version {
 	unsigned int dev_minor;
 };
 
+/* Performance counter groups */
+
+#define KGSL_PERFCOUNTER_GROUP_CP 0x0
+#define KGSL_PERFCOUNTER_GROUP_RBBM 0x1
+#define KGSL_PERFCOUNTER_GROUP_PC 0x2
+#define KGSL_PERFCOUNTER_GROUP_VFD 0x3
+#define KGSL_PERFCOUNTER_GROUP_HLSQ 0x4
+#define KGSL_PERFCOUNTER_GROUP_VPC 0x5
+#define KGSL_PERFCOUNTER_GROUP_TSE 0x6
+#define KGSL_PERFCOUNTER_GROUP_RAS 0x7
+#define KGSL_PERFCOUNTER_GROUP_UCHE 0x8
+#define KGSL_PERFCOUNTER_GROUP_TP 0x9
+#define KGSL_PERFCOUNTER_GROUP_SP 0xA
+#define KGSL_PERFCOUNTER_GROUP_RB 0xB
+#define KGSL_PERFCOUNTER_GROUP_PWR 0xC
+#define KGSL_PERFCOUNTER_GROUP_VBIF 0xD
+#define KGSL_PERFCOUNTER_GROUP_VBIF_PWR 0xE
+
+#define KGSL_PERFCOUNTER_NOT_USED 0xFFFFFFFF
+
 /* structure holds list of ibs */
 struct kgsl_ibdesc {
 	unsigned int gpuaddr;
@@ -287,7 +284,7 @@ struct kgsl_device_waittimestamp_ctxtid {
 #define IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID \
 	_IOW(KGSL_IOC_TYPE, 0x7, struct kgsl_device_waittimestamp_ctxtid)
 
-/* issue indirect commands to the GPU.
+/* DEPRECATED: issue indirect commands to the GPU.
  * drawctxt_id must have been created with IOCTL_KGSL_DRAWCTXT_CREATE
  * ibaddr and sizedwords must specify a subset of a buffer created
  * with IOCTL_KGSL_SHAREDMEM_FROM_PMEM
@@ -295,6 +292,9 @@ struct kgsl_device_waittimestamp_ctxtid {
  * timestamp is a returned counter value which can be passed to
  * other ioctls to determine when the commands have been executed by
  * the GPU.
+ *
+ * This function is deprecated - consider using IOCTL_KGSL_SUBMIT_COMMANDS
+ * instead.
  */
 struct kgsl_ringbuffer_issueibcmds {
 	unsigned int drawctxt_id;
@@ -684,6 +684,202 @@ struct kgsl_gpumem_sync_cache {
 #define IOCTL_KGSL_GPUMEM_SYNC_CACHE \
 	_IOW(KGSL_IOC_TYPE, 0x37, struct kgsl_gpumem_sync_cache)
 
+/**
+ * struct kgsl_perfcounter_get - argument to IOCTL_KGSL_PERFCOUNTER_GET
+ * @groupid: Performance counter group ID
+ * @countable: Countable to select within the group
+ * @offset: Return offset of the reserved counter
+ *
+ * Get an available performance counter from a specified groupid.  The offset
+ * of the performance counter will be returned after successfully assigning
+ * the countable to the counter for the specified group.  An error will be
+ * returned and an offset of 0 if the groupid is invalid or there are no
+ * more counters left.  After successfully getting a perfcounter, the user
+ * must call IOCTL_KGSL_PERFCOUNTER_PUT with the same groupid and countable
+ * when finished with the perfcounter to release its resources.
+ *
+ */
+struct kgsl_perfcounter_get {
+	unsigned int groupid;
+	unsigned int countable;
+	unsigned int offset;
+/* private: reserved for future use */
+	unsigned int __pad[2]; /* For future binary compatibility */
+};
+
+#define IOCTL_KGSL_PERFCOUNTER_GET \
+	_IOWR(KGSL_IOC_TYPE, 0x38, struct kgsl_perfcounter_get)
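+
+/*
+ * Illustrative usage sketch; fd and countable_id are example names:
+ *
+ *	struct kgsl_perfcounter_get get = {
+ *		.groupid = KGSL_PERFCOUNTER_GROUP_SP,
+ *		.countable = countable_id,
+ *	};
+ *	ret = ioctl(fd, IOCTL_KGSL_PERFCOUNTER_GET, &get);
+ *
+ * On success get.offset holds the register offset of the reserved counter;
+ * release it later with IOCTL_KGSL_PERFCOUNTER_PUT.
+ */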
+
+/**
+ * struct kgsl_perfcounter_put - argument to IOCTL_KGSL_PERFCOUNTER_PUT
+ * @groupid: Performance counter group ID
+ * @countable: Countable to release within the group
+ *
+ * Release a previously reserved performance counter so the resource becomes
+ * available to other users.  This should only be called after successfully
+ * reserving a counter with IOCTL_KGSL_PERFCOUNTER_GET.
+ *
+ */
+struct kgsl_perfcounter_put {
+	unsigned int groupid;
+	unsigned int countable;
+/* private: reserved for future use */
+	unsigned int __pad[2]; /* For future binary compatibility */
+};
+
+#define IOCTL_KGSL_PERFCOUNTER_PUT \
+	_IOW(KGSL_IOC_TYPE, 0x39, struct kgsl_perfcounter_put)
+
+/**
+ * struct kgsl_perfcounter_query - argument to IOCTL_KGSL_PERFCOUNTER_QUERY
+ * @groupid: Performance counter group ID
+ * @countables: Return array of the currently active countables
+ * @count: Number of entries in the countables array
+ * @max_counters: Return total number of counters for the group ID
+ *
+ * Query the available performance counters for a given groupid.  The
+ * countables array returns the countable currently assigned to each
+ * counter in the group; the kernel writes at most count entries (or the
+ * group size, whichever is smaller).  The total number of counters
+ * available for the group ID is returned in max_counters.
+ * If the array or count passed in are invalid, only the maximum number of
+ * counters is returned and no data is written to the countables array.
+ * If the groupid is invalid an error code is returned.
+ *
+ */
+struct kgsl_perfcounter_query {
+	unsigned int groupid;
+	/* Array to return the current countable for up to size counters */
+	unsigned int *countables;
+	unsigned int count;
+	unsigned int max_counters;
+/* private: reserved for future use */
+	unsigned int __pad[2]; /* For future binary compatibility */
+};
+
+#define IOCTL_KGSL_PERFCOUNTER_QUERY \
+	_IOWR(KGSL_IOC_TYPE, 0x3A, struct kgsl_perfcounter_query)
+
+/**
+ * struct kgsl_perfcounter_read_group - argument element for
+ * IOCTL_KGSL_PERFCOUNTER_READ
+ * @groupid: Performance counter group ID
+ * @countable: Performance counter countable ID
+ * @value: Return value of the performance counter
+ *
+ * Read the current value of the performance counter assigned to the given
+ * groupid/countable pair.  An array of these structures is passed via
+ * struct kgsl_perfcounter_read and IOCTL_KGSL_PERFCOUNTER_READ.
+ */
+struct kgsl_perfcounter_read_group {
+	unsigned int groupid;
+	unsigned int countable;
+	unsigned long long value;
+};
+
+struct kgsl_perfcounter_read {
+	struct kgsl_perfcounter_read_group *reads;
+	unsigned int count;
+/* private: reserved for future use */
+	unsigned int __pad[2]; /* For future binary compatibility */
+};
+
+#define IOCTL_KGSL_PERFCOUNTER_READ \
+	_IOWR(KGSL_IOC_TYPE, 0x3B, struct kgsl_perfcounter_read)
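+
+/*
+ * Illustrative usage sketch: read back a counter reserved earlier; fd and
+ * countable_id are example names:
+ *
+ *	struct kgsl_perfcounter_read_group grp = {
+ *		.groupid = KGSL_PERFCOUNTER_GROUP_SP,
+ *		.countable = countable_id,
+ *	};
+ *	struct kgsl_perfcounter_read read = {
+ *		.reads = &grp,
+ *		.count = 1,
+ *	};
+ *	ret = ioctl(fd, IOCTL_KGSL_PERFCOUNTER_READ, &read);
+ *
+ * On success grp.value holds the 64-bit value of the counter.
+ */
+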
+/**
+ * struct kgsl_gpumem_sync_cache_bulk - argument to
+ * IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK
+ * @id_list: list of GPU buffer ids of the buffers to sync
+ * @count: number of GPU buffer ids in id_list
+ * @op: a mask of KGSL_GPUMEM_CACHE_* values
+ *
+ * Sync the cache for memory headed to and from the GPU. Certain
+ * optimizations can be made on the cache operation based on the total
+ * size of the working set of memory to be managed.
+ */
+struct kgsl_gpumem_sync_cache_bulk {
+	unsigned int *id_list;
+	unsigned int count;
+	unsigned int op;
+/* private: reserved for future use */
+	unsigned int __pad[2]; /* For future binary compatibility */
+};
+
+#define IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK \
+	_IOWR(KGSL_IOC_TYPE, 0x3C, struct kgsl_gpumem_sync_cache_bulk)
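+
+/*
+ * Illustrative usage sketch; fd, id_a and id_b are example names:
+ *
+ *	unsigned int ids[] = { id_a, id_b };
+ *	struct kgsl_gpumem_sync_cache_bulk bulk = {
+ *		.id_list = ids,
+ *		.count = 2,
+ *		.op = KGSL_GPUMEM_CACHE_CLEAN | KGSL_GPUMEM_CACHE_INV,
+ *	};
+ *	ret = ioctl(fd, IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK, &bulk);
+ */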
+
+/**
+ * struct kgsl_cmd_syncpoint_timestamp - syncpoint on a context/timestamp pair
+ * @context_id: ID of a KGSL context
+ * @timestamp: GPU timestamp
+ *
+ * This structure defines a syncpoint comprising a context/timestamp pair. A
+ * list of these may be passed by IOCTL_KGSL_SUBMIT_COMMANDS to define
+ * dependencies that must be met before the command can be submitted to the
+ * hardware.
+ */
+struct kgsl_cmd_syncpoint_timestamp {
+	unsigned int context_id;
+	unsigned int timestamp;
+};
+
+#define KGSL_CMD_SYNCPOINT_TYPE_TIMESTAMP 0
+
+struct kgsl_cmd_syncpoint_fence {
+	int fd;
+};
+
+#define KGSL_CMD_SYNCPOINT_TYPE_FENCE 1
+
+/**
+ * struct kgsl_cmd_syncpoint - Define a sync point for a command batch
+ * @type: type of sync point defined here
+ * @priv: Pointer to the type specific buffer
+ * @size: Size of the type specific buffer
+ *
+ * This structure contains pointers defining a specific command sync point.
+ * The pointer and size should point to a type appropriate structure.
+ */
+struct kgsl_cmd_syncpoint {
+	int type;
+	void __user *priv;
+	unsigned int size;
+};
+
+/**
+ * struct kgsl_submit_commands - Argument to IOCTL_KGSL_SUBMIT_COMMANDS
+ * @context_id: KGSL context ID that owns the commands
+ * @flags: KGSL_CONTEXT_* flags to apply to the command batch
+ * @cmdlist: User pointer to a list of kgsl_ibdesc structures
+ * @numcmds: Number of commands listed in cmdlist
+ * @synclist: User pointer to a list of kgsl_cmd_syncpoint structures
+ * @numsyncs: Number of sync points listed in synclist
+ * @timestamp: On entry a user defined timestamp, on exit the timestamp
+ * assigned to the command batch
+ *
+ * This structure specifies a command to send to the GPU hardware.  This is
+ * similar to kgsl_issueibcmds except that it doesn't support the legacy way
+ * to submit IB lists and it adds sync points to block the IB until the
+ * dependencies are satisfied.  This entry point is the new and preferred way
+ * to submit commands to the GPU.
+ */
+struct kgsl_submit_commands {
+	unsigned int context_id;
+	unsigned int flags;
+	struct kgsl_ibdesc __user *cmdlist;
+	unsigned int numcmds;
+	struct kgsl_cmd_syncpoint __user *synclist;
+	unsigned int numsyncs;
+	unsigned int timestamp;
+/* private: reserved for future use */
+	unsigned int __pad[4];
+};
+
+#define IOCTL_KGSL_SUBMIT_COMMANDS \
+	_IOWR(KGSL_IOC_TYPE, 0x3D, struct kgsl_submit_commands)
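+
+/*
+ * Illustrative usage sketch; fd, ctxt_id, ib_gpuaddr, ib_dwords and fence_fd
+ * are example names:
+ *
+ *	struct kgsl_ibdesc ib = {
+ *		.gpuaddr = ib_gpuaddr,
+ *		.sizedwords = ib_dwords,
+ *	};
+ *	struct kgsl_cmd_syncpoint_fence fence = { .fd = fence_fd };
+ *	struct kgsl_cmd_syncpoint sync = {
+ *		.type = KGSL_CMD_SYNCPOINT_TYPE_FENCE,
+ *		.priv = &fence,
+ *		.size = sizeof(fence),
+ *	};
+ *	struct kgsl_submit_commands cmds = {
+ *		.context_id = ctxt_id,
+ *		.cmdlist = &ib,
+ *		.numcmds = 1,
+ *		.synclist = &sync,
+ *		.numsyncs = 1,
+ *	};
+ *	ret = ioctl(fd, IOCTL_KGSL_SUBMIT_COMMANDS, &cmds);
+ *
+ * On success cmds.timestamp holds the timestamp assigned to the batch.
+ */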
+
 #ifdef __KERNEL__
 #ifdef CONFIG_MSM_KGSL_DRM
 int kgsl_gem_obj_addr(int drm_fd, int handle, unsigned long *start,