diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef19399a20d3..51453db4fb9f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1378,6 +1378,8 @@ if(ARMV7)
 	set(GPU_NEON GPU/Common/TextureDecoderNEON.cpp)
 endif()
 add_library(GPU OBJECT
+	GPU/Common/DepalettizeShaderCommon.cpp
+	GPU/Common/DepalettizeShaderCommon.h
 	GPU/Common/FramebufferCommon.cpp
 	GPU/Common/FramebufferCommon.h
 	GPU/Common/GPUDebugInterface.h
diff --git a/GPU/Common/DepalettizeShaderCommon.cpp b/GPU/Common/DepalettizeShaderCommon.cpp
new file mode 100644
index 000000000000..48e1a375fa2f
--- /dev/null
+++ b/GPU/Common/DepalettizeShaderCommon.cpp
@@ -0,0 +1,259 @@
+// Copyright (c) 2014- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include <stdio.h>
+
+#include "Common/Log.h"
+#include "Core/Reporting.h"
+#include "GPU/GPUState.h"
+#include "GPU/Common/DepalettizeShaderCommon.h"
+
+
+#define WRITE p+=sprintf
+
+// Uses integer instructions available since OpenGL 3.0. Suitable for ES 3.0 as well. Writes the source into buffer with unbounded sprintf.
+void GenerateDepalShader300(char *buffer, GEBufferFormat pixelFormat) {
+	char *p = buffer;
+#ifdef USING_GLES2
+	WRITE(p, "#version 300 es\n");
+	WRITE(p, "precision mediump float;\n");
+#else
+	WRITE(p, "#version 330\n");
+#endif
+	WRITE(p, "in vec2 v_texcoord0;\n");
+	WRITE(p, "out vec4 fragColor0;\n");
+	WRITE(p, "uniform sampler2D tex;\n");
+	WRITE(p, "uniform sampler2D pal;\n");
+	// Shader body: sample the source pixel, rebuild its raw integer value, then look that up in the palette.
+	WRITE(p, "void main() {\n");
+	WRITE(p, "  vec4 color = texture(tex, v_texcoord0);\n");
+	// The CLUT state is read from gstate once here and baked into the shader text as constants.
+	int mask = gstate.getClutIndexMask();
+	int shift = gstate.getClutIndexShift();
+	int offset = gstate.getClutIndexStartPos();
+	const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
+	// Unfortunately sampling turned our texture into floating point. To avoid this, might be able
+	// to declare them as isampler2D objects, but these require integer textures, which needs more work.
+	// Anyhow, we simply work around this by converting back to integer. Hopefully there will be no loss of precision.
+	// Use the mask to skip reading some components.
+	const u32 shiftedMask = (u32)mask << shift;  // Unsigned: a signed shift overflows int for large shift values.
+	switch (pixelFormat) {
+	case GE_FORMAT_8888:
+		if (shiftedMask & 0xFF) WRITE(p, "  int r = int(color.r * 255.99);\n"); else WRITE(p, "  int r = 0;\n");
+		if (shiftedMask & 0xFF00) WRITE(p, "  int g = int(color.g * 255.99);\n"); else WRITE(p, "  int g = 0;\n");
+		if (shiftedMask & 0xFF0000) WRITE(p, "  int b = int(color.b * 255.99);\n"); else WRITE(p, "  int b = 0;\n");
+		if (shiftedMask & 0xFF000000) WRITE(p, "  int a = int(color.a * 255.99);\n"); else WRITE(p, "  int a = 0;\n");
+		WRITE(p, "  int index = (a << 24) | (b << 16) | (g << 8) | (r);\n");
+		break;
+	case GE_FORMAT_4444:
+		if (shiftedMask & 0xF) WRITE(p, "  int r = int(color.r * 15.99);\n"); else WRITE(p, "  int r = 0;\n");
+		if (shiftedMask & 0xF0) WRITE(p, "  int g = int(color.g * 15.99);\n"); else WRITE(p, "  int g = 0;\n");
+		if (shiftedMask & 0xF00) WRITE(p, "  int b = int(color.b * 15.99);\n"); else WRITE(p, "  int b = 0;\n");
+		if (shiftedMask & 0xF000) WRITE(p, "  int a = int(color.a * 15.99);\n"); else WRITE(p, "  int a = 0;\n");
+		WRITE(p, "  int index = (a << 12) | (b << 8) | (g << 4) | (r);\n");
+		break;
+	case GE_FORMAT_565:
+		if (shiftedMask & 0x1F) WRITE(p, "  int r = int(color.r * 31.99);\n"); else WRITE(p, "  int r = 0;\n");
+		if (shiftedMask & 0x7E0) WRITE(p, "  int g = int(color.g * 63.99);\n"); else WRITE(p, "  int g = 0;\n");
+		if (shiftedMask & 0xF800) WRITE(p, "  int b = int(color.b * 31.99);\n"); else WRITE(p, "  int b = 0;\n");
+		WRITE(p, "  int index = (b << 11) | (g << 5) | (r);\n");
+		break;
+	case GE_FORMAT_5551:
+		if (shiftedMask & 0x1F) WRITE(p, "  int r = int(color.r * 31.99);\n"); else WRITE(p, "  int r = 0;\n");
+		if (shiftedMask & 0x3E0) WRITE(p, "  int g = int(color.g * 31.99);\n"); else WRITE(p, "  int g = 0;\n");
+		if (shiftedMask & 0x7C00) WRITE(p, "  int b = int(color.b * 31.99);\n"); else WRITE(p, "  int b = 0;\n");
+		if (shiftedMask & 0x8000) WRITE(p, "  int a = int(color.a);\n"); else WRITE(p, "  int a = 0;\n");
+		WRITE(p, "  int index = (a << 15) | (b << 10) | (g << 5) | (r);\n");
+		break;
+	default:
+		break;
+	}
+	// The palette texture is 256 texels wide for GE_CMODE_32BIT_ABGR8888 and 512 for every other CLUT format.
+	float texturePixels = 256;
+	if (clutFormat != GE_CMODE_32BIT_ABGR8888)
+		texturePixels = 512;
+	// Apply the CLUT shift, mask and start offset to the reconstructed value.
+	if (shift) {
+		WRITE(p, "  index = ((index >> %i) & 0x%02x)", shift, mask);
+	} else {
+		WRITE(p, "  index = (index & 0x%02x)", mask);
+	}
+	if (offset) {
+		WRITE(p, " | %i;\n", offset);  // '|' matches what we have in gstate.h
+	} else {
+		WRITE(p, ";\n");
+	}
+	// Sample the centre of the selected palette texel (+0.5 before normalising by the palette width).
+	WRITE(p, "  fragColor0 = texture(pal, vec2((float(index) + 0.5) * (1.0 / %f), 0.0));\n", texturePixels);
+	WRITE(p, "}\n");
+}
+
+// FP only, to suit GL(ES) 2.0. Also emits the HLSL variant when lang == HLSL_DX9. Writes into buffer with unbounded sprintf.
+void GenerateDepalShaderFloat(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage lang) {
+	char *p = buffer;
+	// HLSL names the floating-point modulo intrinsic fmod; GLSL calls it mod.
+	const char *modFunc = lang == HLSL_DX9 ? "fmod" : "mod";
+	// lookupMethod: shader expression yielding the palette index (defaults to the red channel). offset is filled in further down.
+	char lookupMethod[128] = "index.r";
+	char offset[128] = "";
+
+	const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
+	const u32 clutBase = gstate.getClutIndexStartPos();
+
+	const int shift = gstate.getClutIndexShift();
+	const int mask = gstate.getClutIndexMask();
+	// Scale applied to lookupMethod's value to turn it into a palette texture coordinate.
+	float index_multiplier = 1.0f;
+	// pixelformat is the format of the texture we are sampling.
+	bool formatOK = true;
+	switch (pixelFormat) {
+	case GE_FORMAT_8888:
+		if ((mask & (mask + 1)) == 0) {
+			// If the value has all bits contiguous (bitmask check above), we can mod by it + 1.
+			const char *rgba = "rrrrrrrrggggggggbbbbbbbbaaaaaaaa";
+			const u8 rgba_shift = shift & 7;
+			if (rgba_shift == 0 && mask == 0xFF) {
+				sprintf(lookupMethod, "index.%c", rgba[shift]);
+			} else {
+				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], 255.99f / (1 << rgba_shift), mask + 1);
+				index_multiplier = 1.0f / 256.0f;
+				// Format was OK if there weren't bits from another component.
+				formatOK = mask <= 255 - (1 << rgba_shift);
+			}
+		} else {
+			formatOK = false;
+		}
+		break;
+	case GE_FORMAT_4444:
+		if ((mask & (mask + 1)) == 0 && shift < 16) {
+			const char *rgba = "rrrrggggbbbbaaaa";
+			const u8 rgba_shift = shift & 3;
+			if (rgba_shift == 0 && mask == 0xF) {
+				sprintf(lookupMethod, "index.%c", rgba[shift]);
+				index_multiplier = 15.0f / 256.0f;
+			} else {
+				// Let's divide and mod to get the right bits.  A common case is shift=0, mask=01.
+				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], 15.99f / (1 << rgba_shift), mask + 1);
+				index_multiplier = 1.0f / 256.0f;
+				formatOK = mask <= 15 - (1 << rgba_shift);
+			}
+		} else {
+			formatOK = false;
+		}
+		break;
+	case GE_FORMAT_565:
+		if ((mask & (mask + 1)) == 0 && shift < 16) {
+			const u8 shifts[16] = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4 };
+			const int multipliers[16] = { 31, 31, 31, 31, 31, 63, 63, 63, 63, 63, 63, 31, 31, 31, 31, 31 };
+			const char *rgba = "rrrrrggggggbbbbb";
+			const u8 rgba_shift = shifts[shift];
+			if (rgba_shift == 0 && mask == multipliers[shift]) {
+				sprintf(lookupMethod, "index.%c", rgba[shift]);
+				index_multiplier = multipliers[shift] / 256.0f;
+			} else {
+				// We just need to divide the right component by the right value, and then mod against the mask.
+				// A common case is shift=1, mask=0f.
+				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], ((float)multipliers[shift] + 0.99f) / (1 << rgba_shift), mask + 1);
+				index_multiplier = 1.0f / 256.0f;
+				formatOK = mask <= multipliers[shift] - (1 << rgba_shift);
+			}
+		} else {
+			formatOK = false;
+		}
+		break;
+	case GE_FORMAT_5551:
+		if ((mask & (mask + 1)) == 0 && shift < 16) {
+			const char *rgba = "rrrrrgggggbbbbba";
+			const u8 rgba_shift = shift % 5;
+			if (rgba_shift == 0 && mask == 0x1F) {
+				sprintf(lookupMethod, "index.%c", rgba[shift]);
+				index_multiplier = 31.0f / 256.0f;
+			} else if (shift == 15 && mask == 1) {
+				sprintf(lookupMethod, "index.%c", rgba[shift]);
+				index_multiplier = 1.0f / 256.0f;
+			} else {
+				// A isn't possible here.
+				sprintf(lookupMethod, "%s(index.%c * %f, %d.0)", modFunc, rgba[shift], 31.99f / (1 << rgba_shift), mask + 1);
+				index_multiplier = 1.0f / 256.0f;
+				formatOK = mask <= 31 - (1 << rgba_shift);
+			}
+		} else {
+			formatOK = false;
+		}
+		break;
+	default:
+		break;
+	}
+	// Palette texture width in texels. The 512-wide palette (any CLUT format but 32-bit) needs the index scale halved.
+	float texturePixels = 256.f;
+	if (clutFormat != GE_CMODE_32BIT_ABGR8888) {
+		texturePixels = 512.f;
+		index_multiplier *= 0.5f;
+	}
+
+	// Adjust index_multiplier, similar to the use of 15.99 instead of 16 in the ES 3 path.
+	// index_multiplier -= 0.01f / texturePixels;
+
+	if (!formatOK) {
+		ERROR_LOG_REPORT_ONCE(depal, G3D, "%i depal unsupported: shift=%i mask=%02x offset=%d", pixelFormat, shift, mask, clutBase);
+	}
+
+	// Offset by half a texel (plus clutBase) to turn NEAREST filtering into FLOOR.
+	float texel_offset = ((float)clutBase + 0.5f) / texturePixels;
+	sprintf(offset, " + %f", texel_offset);
+	// Emit the source. Nothing is written to buffer for any lang other than GLSL_140 or HLSL_DX9.
+	if (lang == GLSL_140) {
+#ifdef USING_GLES2
+		WRITE(p, "#version 100\n");
+		WRITE(p, "precision mediump float;\n");
+#else
+		WRITE(p, "#version 110\n");
+#endif
+		WRITE(p, "varying vec2 v_texcoord0;\n");
+		WRITE(p, "uniform sampler2D tex;\n");
+		WRITE(p, "uniform sampler2D pal;\n");
+		WRITE(p, "void main() {\n");
+		WRITE(p, "  vec4 index = texture2D(tex, v_texcoord0);\n");
+		WRITE(p, "  float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
+		WRITE(p, "  gl_FragColor = texture2D(pal, vec2(coord, 0.0));\n");
+		WRITE(p, "}\n");
+	} else if (lang == HLSL_DX9) {
+		WRITE(p, "sampler tex: register(s0);\n");
+		WRITE(p, "sampler pal: register(s1);\n");
+		WRITE(p, "float4 main(float2 v_texcoord0 : TEXCOORD0) : COLOR0 {\n");
+		WRITE(p, "  float4 index = tex2D(tex, v_texcoord0);\n");
+		WRITE(p, "  float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
+		WRITE(p, "  return tex2D(pal, float2(coord, 0.0)).bgra;\n");
+		WRITE(p, "}\n");
+	}
+}
+
+void GenerateDepalShader(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage language) {
+	// Dispatch on the target language:
+	//  - GLSL_300 gets the integer-math generator;
+	//  - GLSL_140 and HLSL_DX9 share the float-only generator, which is told
+	//    which of the two it is emitting.
+	// Any other value leaves buffer untouched.
+	if (language == GLSL_300) {
+		GenerateDepalShader300(buffer, pixelFormat);
+		return;
+	}
+	if (language == GLSL_140 || language == HLSL_DX9)
+		GenerateDepalShaderFloat(buffer, pixelFormat, language);
+}
+
+#undef WRITE
\ No newline at end of file
diff --git a/GPU/Common/DepalettizeShaderCommon.h b/GPU/Common/DepalettizeShaderCommon.h
new file mode 100644
index 000000000000..eccc53f7ffbb
--- /dev/null
+++ b/GPU/Common/DepalettizeShaderCommon.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2014- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include "GPU/ge_constants.h"
+
+enum ShaderLanguage {
+	GLSL_140,  // Float-only GLSL path (emitted as "#version 100" when USING_GLES2 is defined, "#version 110" otherwise).
+	GLSL_300,  // Integer GLSL path ("#version 300 es" when USING_GLES2 is defined, "#version 330" otherwise).
+	HLSL_DX9,  // Float-only HLSL pixel shader, as requested by the DX9 depal shader cache.
+};
+// Writes depal shader source for `language` into `buffer` using the current gstate CLUT settings. The buffer size is not checked.
+void GenerateDepalShader(char *buffer, GEBufferFormat pixelFormat, ShaderLanguage language);
diff --git a/GPU/Directx9/DepalettizeShaderDX9.cpp b/GPU/Directx9/DepalettizeShaderDX9.cpp
new file mode 100644
index 000000000000..4511391fef09
--- /dev/null
+++ b/GPU/Directx9/DepalettizeShaderDX9.cpp
@@ -0,0 +1,175 @@
+// Copyright (c) 2014- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#include <map>
+
+#include "base/logging.h"
+#include "Common/Log.h"
+#include "Core/Reporting.h"
+#include "GPU/GPUState.h"
+#include "GPU/Directx9/TextureCacheDX9.h"
+#include "GPU/Directx9/DepalettizeShaderDX9.h"
+#include "GPU/Common/DepalettizeShaderCommon.h"
+#include "GPU/Directx9/helper/global.h"
+
+namespace DX9 {
+
+static const int DEPAL_TEXTURE_OLD_AGE = 120;  // Flips a CLUT texture may go unrequested before Decimate() frees it.
+
+#ifdef _WIN32
+#define SHADERLOG
+#endif
+
+static const char *depalVShaderHLSL =  // Pass-through vertex shader: forwards position (w = 1.0) and texcoord unchanged.
+"struct VS_IN {\n"
+"  float3 a_position : POSITION;\n"
+"  float2 a_texcoord0 : TEXCOORD0;\n"
+"};\n"
+"struct VS_OUT {\n"
+"  float4 Position : POSITION;\n"
+"  float2 Texcoord : TEXCOORD0;\n"
+"};\n"
+"VS_OUT main(VS_IN input) {\n"
+"  VS_OUT output;\n"
+"  output.Texcoord = input.a_texcoord0;\n"
+"  output.Position = float4(input.a_position, 1.0);\n"
+"  return output;\n"
+"}\n";
+
+DepalShaderCacheDX9::DepalShaderCacheDX9() : vertexShader_(nullptr) {
+	std::string compileError;
+	if (DX9::CompileVertexShader(depalVShaderHLSL, &vertexShader_, nullptr, compileError))
+		return;  // The shared depal vertex shader compiled; nothing more to do.
+	ERROR_LOG(G3D, "error compling depal vshader: %s", compileError.c_str());
+}
+
+DepalShaderCacheDX9::~DepalShaderCacheDX9() {
+	Clear();  // Frees every cached pixel shader and CLUT texture.
+	if (vertexShader_ == nullptr)
+		return;  // No vertex shader to release.
+	vertexShader_->Release();
+}
+
+u32 DepalShaderCacheDX9::GenerateShaderID(GEBufferFormat pixelFormat) {
+	return (pixelFormat << 24) | (gstate.clutformat & 0xFFFFFF);  // Pixel format in the top byte, low 24 bits of clutformat below it.
+}
+
+LPDIRECT3DTEXTURE9 DepalShaderCacheDX9::GetClutTexture(const u32 clutID, u32 *rawClut) {
+	// Returns the palette texture cached for clutID, creating and uploading it from rawClut on a miss; nullptr if creation or locking fails.
+	GEPaletteFormat palFormat = gstate.getClutPaletteFormat();
+	const u32 realClutID = clutID ^ palFormat;  // Fold the palette format into the cache key.
+	auto oldtex = texCache_.find(realClutID);
+	if (oldtex != texCache_.end()) {
+		oldtex->second->lastFrame = gpuStats.numFlips;  // Mark as recently used so Decimate() keeps it.
+		return oldtex->second->texture;
+	}
+
+	D3DFORMAT dstFmt = DX9::getClutDestFormat(palFormat);
+	int texturePixels = palFormat == GE_CMODE_32BIT_ABGR8888 ? 256 : 512;
+	// Create texture
+	D3DPOOL pool = D3DPOOL_MANAGED;
+	int usage = 0;
+	if (pD3DdeviceEx) {
+		pool = D3DPOOL_DEFAULT;
+		usage = D3DUSAGE_DYNAMIC;  // TODO: Switch to using a staging texture?
+	}
+
+	LPDIRECT3DTEXTURE9 texture = nullptr;  // Wrapped in a cache entry only once the upload succeeded, so failures leak nothing.
+	HRESULT hr = pD3Ddevice->CreateTexture(texturePixels, 1, 1, usage, (D3DFORMAT)D3DFMT(dstFmt), pool, &texture, NULL);
+	if (FAILED(hr)) {
+		ERROR_LOG(G3D, "Failed to create D3D texture for depal");
+		return nullptr;
+	}
+
+	D3DLOCKED_RECT rect;
+	hr = texture->LockRect(0, &rect, NULL, 0);
+	if (FAILED(hr)) {
+		ERROR_LOG(G3D, "Failed to lock D3D texture for depal");
+		texture->Release();  // Don't leak the texture we just created.
+		return nullptr;
+	}
+	// Regardless of format, the CLUT should always be 1024 bytes.
+	memcpy(rect.pBits, rawClut, 1024);
+	texture->UnlockRect(0);
+
+	pD3Ddevice->SetSamplerState(1, D3DSAMP_ADDRESSU, D3DTADDRESS_CLAMP);
+	pD3Ddevice->SetSamplerState(1, D3DSAMP_ADDRESSV, D3DTADDRESS_CLAMP);
+	pD3Ddevice->SetSamplerState(1, D3DSAMP_MINFILTER, D3DTEXF_POINT);
+	pD3Ddevice->SetSamplerState(1, D3DSAMP_MAGFILTER, D3DTEXF_POINT);
+	DepalTextureDX9 *tex = new DepalTextureDX9();
+	tex->texture = texture;
+	tex->lastFrame = gpuStats.numFlips;
+	texCache_[realClutID] = tex;
+	return tex->texture;
+}
+
+void DepalShaderCacheDX9::Clear() {
+	for (auto &cached : cache_) {  // Pixel shaders: release the D3D object, then free its wrapper.
+		cached.second->pixelShader->Release();
+		delete cached.second;
+	}
+	cache_.clear();
+	for (auto &cached : texCache_) {  // CLUT textures: same two-step teardown.
+		cached.second->texture->Release();
+		delete cached.second;
+	}
+	texCache_.clear();
+}
+
+void DepalShaderCacheDX9::Decimate() {
+	auto cursor = texCache_.begin();
+	while (cursor != texCache_.end()) {
+		auto current = cursor++;  // Advance first so erasing `current` cannot invalidate the cursor.
+		if (current->second->lastFrame + DEPAL_TEXTURE_OLD_AGE >= gpuStats.numFlips)
+			continue;  // Requested within the last DEPAL_TEXTURE_OLD_AGE flips: keep it.
+		current->second->texture->Release();
+		delete current->second;
+		texCache_.erase(current);
+	}
+}
+
+LPDIRECT3DPIXELSHADER9 DepalShaderCacheDX9::GetDepalettizePixelShader(GEBufferFormat pixelFormat) {
+	// Returns the pixel shader cached for the current CLUT state and pixelFormat, compiling it on a miss.
+	// Returns nullptr if compilation fails; failed compiles are not cached.
+	u32 id = GenerateShaderID(pixelFormat);
+
+	auto shader = cache_.find(id);
+	if (shader != cache_.end()) {
+		return shader->second->pixelShader;
+	}
+
+	// The generator does not bound its writes, so this must stay large enough for the longest
+	// shader it can emit. A stack array needs no cleanup on either exit path.
+	char buffer[2048];
+	buffer[0] = '\0';  // Keep the source well-defined even if the generator writes nothing.
+	GenerateDepalShader(buffer, pixelFormat, HLSL_DX9);
+
+	LPDIRECT3DPIXELSHADER9 pshader = nullptr;
+	std::string errorMessage;
+	if (!CompilePixelShader(buffer, &pshader, NULL, errorMessage)) {
+		ERROR_LOG(G3D, "Failed to compile depal pixel shader: %s\n\n%s", buffer, errorMessage.c_str());
+		return nullptr;
+	}
+
+	DepalShaderDX9 *depal = new DepalShaderDX9();
+	depal->pixelShader = pshader;
+	cache_[id] = depal;
+
+	return depal->pixelShader;
+}
+
+}  // namespace
\ No newline at end of file
diff --git a/GPU/Directx9/DepalettizeShaderDX9.h b/GPU/Directx9/DepalettizeShaderDX9.h
new file mode 100644
index 000000000000..55a4ae79113a
--- /dev/null
+++ b/GPU/Directx9/DepalettizeShaderDX9.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2014- PPSSPP Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0 or later versions.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official git repository and contact information can be found at
+// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
+
+#pragma once
+
+#include <map>
+#include "Common/CommonTypes.h"
+#include "GPU/ge_constants.h"
+#include "GPU/Directx9/helper/global.h"
+
+namespace DX9 {
+
+class DepalShaderDX9 {
+public:
+	LPDIRECT3DPIXELSHADER9 pixelShader;  // Compiled depal pixel shader; released by DepalShaderCacheDX9::Clear().
+};
+
+class DepalTextureDX9 {
+public:
+	LPDIRECT3DTEXTURE9 texture;  // Palette (CLUT) texture; released by the owning cache in Clear() or Decimate().
+	int lastFrame;  // gpuStats.numFlips value when this texture was last requested.
+};
+
+// Caches both shaders and palette textures.
+class DepalShaderCacheDX9 {
+public:
+	DepalShaderCacheDX9();
+	~DepalShaderCacheDX9();
+
+	// Returns the depal pixel shader for the current CLUT state, compiling and caching it on a miss (nullptr on failure).
+	LPDIRECT3DPIXELSHADER9 GetDepalettizePixelShader(GEBufferFormat pixelFormat);
+	LPDIRECT3DVERTEXSHADER9 GetDepalettizeVertexShader() { return vertexShader_; }
+	LPDIRECT3DTEXTURE9 GetClutTexture(const u32 clutHash, u32 *rawClut);  // Cached palette texture, uploaded from rawClut on a miss.
+	void Clear();  // Releases every cached pixel shader and palette texture.
+	void Decimate();  // Frees palette textures not requested for more than DEPAL_TEXTURE_OLD_AGE flips.
+
+private:
+	u32 GenerateShaderID(GEBufferFormat pixelFormat);
+
+	LPDIRECT3DVERTEXSHADER9 vertexShader_;
+	std::map<u32, DepalShaderDX9 *> cache_;
+	std::map<u32, DepalTextureDX9 *> texCache_;
+};
+
+}  // namespace
\ No newline at end of file
diff --git a/GPU/Directx9/FramebufferDX9.cpp b/GPU/Directx9/FramebufferDX9.cpp
index 990ceb829554..5d1bf76adba0 100644
--- a/GPU/Directx9/FramebufferDX9.cpp
+++ b/GPU/Directx9/FramebufferDX9.cpp
@@ -63,33 +63,33 @@ namespace DX9 {
 	void CenterRect(float *x, float *y, float *w, float *h,
                 float origW, float origH, float frameW, float frameH) {
 		if (g_Config.bStretchToDisplay) {
-				*x = 0;
-				*y = 0;
-				*w = frameW;
-				*h = frameH;
-				return;
+			*x = 0;
+			*y = 0;
+			*w = frameW;
+			*h = frameH;
+			return;
 		}
 
 		float origRatio = origW/origH;
 		float frameRatio = frameW/frameH;
 
 		if (origRatio > frameRatio) {
-				// Image is wider than frame. Center vertically.
-				float scale = origW / frameW;
-				*x = 0.0f;
-				*w = frameW;
-				*h = frameW / origRatio;
-				// Stretch a little bit
-				if (g_Config.bPartialStretch)
-						*h = (frameH + *h) / 2.0f; // (408 + 720) / 2 = 564
-				*y = (frameH - *h) / 2.0f;
+			// Image is wider than frame. Center vertically.
+			float scale = origW / frameW;
+			*x = 0.0f;
+			*w = frameW;
+			*h = frameW / origRatio;
+			// Stretch a little bit
+			if (g_Config.bPartialStretch)
+				*h = (frameH + *h) / 2.0f; // (408 + 720) / 2 = 564
+			*y = (frameH - *h) / 2.0f;
 		} else {
-				// Image is taller than frame. Center horizontally.
-				float scale = origH / frameH;
-				*y = 0.0f;
-				*h = frameH;
-				*w = frameH * origRatio;
-				*x = (frameW - *w) / 2.0f;
+			// Image is taller than frame. Center horizontally.
+			float scale = origH / frameH;
+			*y = 0.0f;
+			*h = frameH;
+			*w = frameH * origRatio;
+			*x = (frameW - *w) / 2.0f;
 		}
 	}
 
@@ -202,8 +202,6 @@ namespace DX9 {
 		convBuf = (u8*)rect.pBits;
 
 		// Final format is BGRA(directx)
-
-		// TODO: We can just change the texture format and flip some bits around instead of this.
 		if (srcPixelFormat != GE_FORMAT_8888 || srcStride != 512) {
 			for (int y = 0; y < height; y++) {
 				switch (srcPixelFormat) {
diff --git a/GPU/Directx9/FramebufferDX9.h b/GPU/Directx9/FramebufferDX9.h
index 888f4af97d14..c27a14987d15 100644
--- a/GPU/Directx9/FramebufferDX9.h
+++ b/GPU/Directx9/FramebufferDX9.h
@@ -73,7 +73,7 @@ class FramebufferManagerDX9 : public FramebufferManagerCommon {
 
 	void BlitFramebufferDepth(VirtualFramebuffer *src, VirtualFramebuffer *dst);
 
-	void BindFramebufferColor(int stage, VirtualFramebuffer *framebuffer, bool skipCopy);
+	void BindFramebufferColor(int stage, VirtualFramebuffer *framebuffer, bool skipCopy = false);
 
 	virtual void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) override;
 
diff --git a/GPU/Directx9/GPU_DX9.cpp b/GPU/Directx9/GPU_DX9.cpp
index 597e625b31fe..04e1391d4a9b 100644
--- a/GPU/Directx9/GPU_DX9.cpp
+++ b/GPU/Directx9/GPU_DX9.cpp
@@ -403,6 +403,7 @@ DIRECTX9_GPU::DIRECTX9_GPU()
 	framebufferManager_.SetShaderManager(shaderManager_);
 	framebufferManager_.SetTransformDrawEngine(&transformDraw_);
 	textureCache_.SetFramebufferManager(&framebufferManager_);
+	textureCache_.SetDepalShaderCache(&depalShaderCache_);
 	textureCache_.SetShaderManager(shaderManager_);
 
 	// Sanity check gstate
@@ -525,7 +526,7 @@ void DIRECTX9_GPU::BeginFrameInternal() {
 
 	textureCache_.StartFrame();
 	transformDraw_.DecimateTrackedVertexArrays();
-	// depalShaderCache_.Decimate();
+	depalShaderCache_.Decimate();
 	// fragmentTestCache_.Decimate();
 
 	if (dumpNextFrame_) {
diff --git a/GPU/Directx9/GPU_DX9.h b/GPU/Directx9/GPU_DX9.h
index 37527f0d8d55..8b093d310ad1 100644
--- a/GPU/Directx9/GPU_DX9.h
+++ b/GPU/Directx9/GPU_DX9.h
@@ -24,6 +24,7 @@
 #include "GPU/Directx9/FramebufferDX9.h"
 #include "GPU/Directx9/TransformPipelineDX9.h"
 #include "GPU/Directx9/TextureCacheDX9.h"
+#include "GPU/Directx9/DepalettizeShaderDX9.h"
 #include "GPU/Directx9/helper/fbo.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
@@ -167,6 +168,7 @@ class DIRECTX9_GPU : public GPUCommon
 
 	FramebufferManagerDX9 framebufferManager_;
 	TextureCacheDX9 textureCache_;
+	DepalShaderCacheDX9 depalShaderCache_;
 	TransformDrawEngineDX9 transformDraw_;
 	ShaderManagerDX9 *shaderManager_;
 
diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp
index f45a60a15f34..7778b2956c13 100644
--- a/GPU/Directx9/TextureCacheDX9.cpp
+++ b/GPU/Directx9/TextureCacheDX9.cpp
@@ -26,6 +26,8 @@
 #include "GPU/Directx9/PixelShaderGeneratorDX9.h"
 #include "GPU/Directx9/TextureCacheDX9.h"
 #include "GPU/Directx9/FramebufferDX9.h"
+#include "GPU/Directx9/ShaderManagerDX9.h"
+#include "GPU/Directx9/DepalettizeShaderDX9.h"
 #include "GPU/Directx9/helper/dx_state.h"
 #include "GPU/Common/FramebufferCommon.h"
 #include "GPU/Common/TextureDecoder.h"
@@ -897,21 +899,84 @@ void TextureCacheDX9::SetTextureFramebuffer(TexCacheEntry *entry, VirtualFramebu
 	framebuffer->usageFlags |= FB_USAGE_TEXTURE;
 	bool useBufferedRendering = g_Config.iRenderingMode != FB_NON_BUFFERED_MODE;
 	if (useBufferedRendering) {
-		// TODO: Depal
-		// For now, let's not bind FBOs that we know are off (invalidHint will be -1.)
-		// But let's still not use random memory.
-		if (entry->framebuffer->fbo) {
-			fbo_bind_color_as_texture(entry->framebuffer->fbo, 0);
-			// Keep the framebuffer alive.
-			// TODO: Dangerous if it sets a new one?
-			entry->framebuffer->last_frame_used = gpuStats.numFlips;
+		LPDIRECT3DPIXELSHADER9 pshader = nullptr;
+		if ((entry->status & TexCacheEntry::STATUS_DEPALETTIZE) && !g_Config.bDisableSlowFramebufEffects) {
+			pshader = depalShaderCache_->GetDepalettizePixelShader(framebuffer->drawnFormat);
+		}
+
+		if (pshader) {
+			LPDIRECT3DTEXTURE9 clutTexture = depalShaderCache_->GetClutTexture(clutHash_, clutBuf_);
+
+			FBO *depalFBO = framebufferManager_->GetTempFBO(framebuffer->renderWidth, framebuffer->renderHeight, FBO_8888);
+			fbo_bind_as_render_target(depalFBO);
+
+			float xoff = -0.5f / framebuffer->renderWidth;
+			float yoff = 0.5f / framebuffer->renderHeight;
+
+			const float pos[12 + 8] = {
+				-1 + xoff, 1 + yoff, 0,    0, 0,
+				1 + xoff, 1 + yoff, 0,     1, 0,
+				1 + xoff, -1 + yoff, 0,    1, 1,
+				-1 + xoff, -1 + yoff, 0,   0, 1,
+			};
+
+			shaderManager_->DirtyLastShader();
+
+			pD3Ddevice->SetPixelShader(pshader);
+			pD3Ddevice->SetVertexShader(depalShaderCache_->GetDepalettizeVertexShader());
+			pD3Ddevice->SetVertexDeclaration(pFramebufferVertexDecl);
+			pD3Ddevice->SetTexture(1, clutTexture);
+			pD3Ddevice->SetSamplerState(1, D3DSAMP_MINFILTER, D3DTEXF_POINT);
+			pD3Ddevice->SetSamplerState(1, D3DSAMP_MAGFILTER, D3DTEXF_POINT);
+			pD3Ddevice->SetSamplerState(1, D3DSAMP_MIPFILTER, D3DTEXF_NONE);
+
+			framebufferManager_->BindFramebufferColor(0, framebuffer, true);
+			pD3Ddevice->SetSamplerState(0, D3DSAMP_MINFILTER, D3DTEXF_POINT);
+			pD3Ddevice->SetSamplerState(0, D3DSAMP_MAGFILTER, D3DTEXF_POINT);
+			pD3Ddevice->SetSamplerState(0, D3DSAMP_MIPFILTER, D3DTEXF_NONE);
+
+			pD3Ddevice->SetRenderState(D3DRS_ALPHABLENDENABLE, FALSE);
+			pD3Ddevice->SetRenderState(D3DRS_SEPARATEALPHABLENDENABLE, FALSE);
+			pD3Ddevice->SetRenderState(D3DRS_COLORWRITEENABLE, D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA);
+			pD3Ddevice->SetRenderState(D3DRS_ZENABLE, FALSE);
+			pD3Ddevice->SetRenderState(D3DRS_STENCILENABLE, FALSE);
+			pD3Ddevice->SetRenderState(D3DRS_SCISSORTESTENABLE, FALSE);
+			pD3Ddevice->SetRenderState(D3DRS_CULLMODE, D3DCULL_NONE);
+
+			D3DVIEWPORT9 vp;
+			vp.MinZ = 0;
+			vp.MaxZ = 1;
+			vp.X = 0;
+			vp.Y = 0;
+			vp.Width = framebuffer->renderWidth;
+			vp.Height = framebuffer->renderHeight;
+			pD3Ddevice->SetViewport(&vp);
+
+			HRESULT hr = pD3Ddevice->DrawPrimitiveUP(D3DPT_TRIANGLEFAN, 2, pos, (3 + 2) * sizeof(float));
+			if (FAILED(hr)) {
+				ERROR_LOG_REPORT(G3D, "Depal render failed: %08x", hr);
+			}
+
+			framebufferManager_->RebindFramebuffer();
+			fbo_bind_color_as_texture(depalFBO, 0);
+			dxstate.Restore();
+			dxstate.viewport.restore();
+
+			const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
+			const u32 clutBase = gstate.getClutIndexStartPos();
+			const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16);
+			const u32 clutExtendedColors = (clutTotalBytes_ / bytesPerColor) + clutBase;
+
+			TexCacheEntry::Status alphaStatus = CheckAlpha(clutBuf_, getClutDestFormat(gstate.getClutPaletteFormat()), clutExtendedColors, clutExtendedColors, 1);
+			gstate_c.textureFullAlpha = alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL;
+			gstate_c.textureSimpleAlpha = alphaStatus == TexCacheEntry::STATUS_ALPHA_SIMPLE;
 		} else {
-			pD3Ddevice->SetTexture(0, NULL);
-			gstate_c.skipDrawReason |= SKIPDRAW_BAD_FB_TEXTURE;
-		}
+			entry->status &= ~TexCacheEntry::STATUS_DEPALETTIZE;
+			framebufferManager_->BindFramebufferColor(0, framebuffer);
 
-		gstate_c.textureFullAlpha = gstate.getTextureFormat() == GE_TFMT_5650;
-		gstate_c.textureSimpleAlpha = gstate_c.textureFullAlpha;
+			gstate_c.textureFullAlpha = gstate.getTextureFormat() == GE_TFMT_5650;
+			gstate_c.textureSimpleAlpha = gstate_c.textureFullAlpha;
+		}
 
 		// Keep the framebuffer alive.
 		framebuffer->last_frame_used = gpuStats.numFlips;
diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h
index cac576b0f3b3..67f8d1d0a346 100644
--- a/GPU/Directx9/TextureCacheDX9.h
+++ b/GPU/Directx9/TextureCacheDX9.h
@@ -32,6 +32,7 @@ struct VirtualFramebuffer;
 namespace DX9 {
 
 class FramebufferManagerDX9;
+class DepalShaderCacheDX9;
 class ShaderManagerDX9;
 
 enum TextureFiltering {
@@ -69,6 +70,9 @@ class TextureCacheDX9 : public TextureCacheCommon {
 	void SetFramebufferManager(FramebufferManagerDX9 *fbManager) {
 		framebufferManager_ = fbManager;
 	}
+	void SetDepalShaderCache(DepalShaderCacheDX9 *dpCache) {
+		depalShaderCache_ = dpCache;
+	}
 	void SetShaderManager(ShaderManagerDX9 *sm) {
 		shaderManager_ = sm;
 	}
@@ -223,6 +227,7 @@ class TextureCacheDX9 : public TextureCacheCommon {
 	int timesInvalidatedAllThisFrame_;
 
 	FramebufferManagerDX9 *framebufferManager_;
+	DepalShaderCacheDX9 *depalShaderCache_;
 	ShaderManagerDX9 *shaderManager_;
 };
 
diff --git a/GPU/GLES/DepalettizeShader.cpp b/GPU/GLES/DepalettizeShader.cpp
index c146d9296e75..ad935c88a05f 100644
--- a/GPU/GLES/DepalettizeShader.cpp
+++ b/GPU/GLES/DepalettizeShader.cpp
@@ -23,6 +23,7 @@
 #include "DepalettizeShader.h"
 #include "GPU/GPUState.h"
 #include "GPU/GLES/TextureCache.h"
+#include "GPU/Common/DepalettizeShaderCommon.h"
 
 static const int DEPAL_TEXTURE_OLD_AGE = 120;
 
@@ -107,216 +108,6 @@ DepalShaderCache::~DepalShaderCache() {
 	glDeleteShader(vertexShader_);
 }
 
-#define WRITE p+=sprintf
-
-// Uses integer instructions available since OpenGL 3.0. Suitable for ES 3.0 as well.
-void GenerateDepalShader300(char *buffer, GEBufferFormat pixelFormat) {
-	char *p = buffer;
-#ifdef USING_GLES2
-	WRITE(p, "#version 300 es\n");
-	WRITE(p, "precision mediump float;\n");
-#else
-	WRITE(p, "#version 330\n");
-#endif
-	WRITE(p, "in vec2 v_texcoord0;\n");
-	WRITE(p, "out vec4 fragColor0;\n");
-	WRITE(p, "uniform sampler2D tex;\n");
-	WRITE(p, "uniform sampler2D pal;\n");
-
-	WRITE(p, "void main() {\n");
-	WRITE(p, "  vec4 color = texture(tex, v_texcoord0);\n");
-
-	int mask = gstate.getClutIndexMask();
-	int shift = gstate.getClutIndexShift();
-	int offset = gstate.getClutIndexStartPos();
-	const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
-	// Unfortunately sampling turned our texture into floating point. To avoid this, might be able
-	// to declare them as isampler2D objects, but these require integer textures, which needs more work.
-	// Anyhow, we simply work around this by converting back to integer. Hopefully there will be no loss of precision.
-	// Use the mask to skip reading some components.
-	int shiftedMask = mask << shift;
-	switch (pixelFormat) {
-	case GE_FORMAT_8888:
-		if (shiftedMask & 0xFF) WRITE(p, "  int r = int(color.r * 255.99);\n"); else WRITE(p, "  int r = 0;\n");
-		if (shiftedMask & 0xFF00) WRITE(p, "  int g = int(color.g * 255.99);\n"); else WRITE(p, "  int g = 0;\n");
-		if (shiftedMask & 0xFF0000) WRITE(p, "  int b = int(color.b * 255.99);\n"); else WRITE(p, "  int b = 0;\n");
-		if (shiftedMask & 0xFF000000) WRITE(p, "  int a = int(color.a * 255.99);\n"); else WRITE(p, "  int a = 0;\n");
-		WRITE(p, "  int index = (a << 24) | (b << 16) | (g << 8) | (r);\n");
-		break;
-	case GE_FORMAT_4444:
-		if (shiftedMask & 0xF) WRITE(p, "  int r = int(color.r * 15.99);\n"); else WRITE(p, "  int r = 0;\n");
-		if (shiftedMask & 0xF0) WRITE(p, "  int g = int(color.g * 15.99);\n"); else WRITE(p, "  int g = 0;\n");
-		if (shiftedMask & 0xF00) WRITE(p, "  int b = int(color.b * 15.99);\n"); else WRITE(p, "  int b = 0;\n");
-		if (shiftedMask & 0xF000) WRITE(p, "  int a = int(color.a * 15.99);\n"); else WRITE(p, "  int a = 0;\n");
-		WRITE(p, "  int index = (a << 12) | (b << 8) | (g << 4) | (r);\n");
-		break;
-	case GE_FORMAT_565:
-		if (shiftedMask & 0x1F) WRITE(p, "  int r = int(color.r * 31.99);\n"); else WRITE(p, "  int r = 0;\n");
-		if (shiftedMask & 0x7E0) WRITE(p, "  int g = int(color.g * 63.99);\n"); else WRITE(p, "  int g = 0;\n");
-		if (shiftedMask & 0xF800) WRITE(p, "  int b = int(color.b * 31.99);\n"); else WRITE(p, "  int b = 0;\n");
-		WRITE(p, "  int index = (b << 11) | (g << 5) | (r);\n");
-		break;
-	case GE_FORMAT_5551:
-		if (shiftedMask & 0x1F) WRITE(p, "  int r = int(color.r * 31.99);\n"); else WRITE(p, "  int r = 0;\n");
-		if (shiftedMask & 0x3E0) WRITE(p, "  int g = int(color.g * 31.99);\n"); else WRITE(p, "  int g = 0;\n");
-		if (shiftedMask & 0x7C00) WRITE(p, "  int b = int(color.b * 31.99);\n"); else WRITE(p, "  int b = 0;\n");
-		if (shiftedMask & 0x8000) WRITE(p, "  int a = int(color.a);\n"); else WRITE(p, "  int a = 0;\n");
-		WRITE(p, "  int index = (a << 15) | (b << 10) | (g << 5) | (r);\n");
-		break;
-	default:
-		break;
-	}
-
-	float texturePixels = 256;
-	if (clutFormat != GE_CMODE_32BIT_ABGR8888)
-		texturePixels = 512;
-
-	if (shift) {
-		WRITE(p, "  index = ((index >> %i) & 0x%02x)", shift, mask);
-	} else {
-		WRITE(p, "  index = (index & 0x%02x)", mask);
-	}
-	if (offset) {
-		WRITE(p, " | %i;\n", offset);  // '|' matches what we have in gstate.h
-	} else {
-		WRITE(p, ";\n");
-	}
-
-	WRITE(p, "  fragColor0 = texture(pal, vec2((float(index) + 0.5) * (1.0 / %f), 0.0));\n", texturePixels);
-	WRITE(p, "}\n");
-}
-
-// FP only, to suit GL(ES) 2.0
-void GenerateDepalShader100(char *buffer, GEBufferFormat pixelFormat) {
-	char *p = buffer;
-
-	char lookupMethod[128] = "index.r";
-	char offset[128] = "";
-
-	const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
-	const u32 clutBase = gstate.getClutIndexStartPos();
-
-	const int shift = gstate.getClutIndexShift();
-	const int mask = gstate.getClutIndexMask();
-
-	float index_multiplier = 1.0f;
-	// pixelformat is the format of the texture we are sampling.
-	bool formatOK = true;
-	switch (pixelFormat) {
-	case GE_FORMAT_8888:
-		if ((mask & (mask + 1)) == 0) {
-			// If the value has all bits contiguous (bitmask check above), we can mod by it + 1.
-			const char *rgba = "rrrrrrrrggggggggbbbbbbbbaaaaaaaa";
-			const u8 rgba_shift = shift & 7;
-			if (rgba_shift == 0 && mask == 0xFF) {
-				sprintf(lookupMethod, "index.%c", rgba[shift]);
-			} else {
-				sprintf(lookupMethod, "mod(index.%c * %f, %d.0)", rgba[shift], 255.99f / (1 << rgba_shift), mask + 1);
-				index_multiplier = 1.0f / 256.0f;
-				// Format was OK if there weren't bits from another component.
-				formatOK = mask <= 255 - (1 << rgba_shift);
-			}
-		} else {
-			formatOK = false;
-		}
-		break;
-	case GE_FORMAT_4444:
-		if ((mask & (mask + 1)) == 0 && shift < 16) {
-			const char *rgba = "rrrrggggbbbbaaaa";
-			const u8 rgba_shift = shift & 3;
-			if (rgba_shift == 0 && mask == 0xF) {
-				sprintf(lookupMethod, "index.%c", rgba[shift]);
-				index_multiplier = 15.0f / 256.0f;
-			} else {
-				// Let's divide and mod to get the right bits.  A common case is shift=0, mask=01.
-				sprintf(lookupMethod, "mod(index.%c * %f, %d.0)", rgba[shift], 15.99f / (1 << rgba_shift), mask + 1);
-				index_multiplier = 1.0f / 256.0f;
-				formatOK = mask <= 15 - (1 << rgba_shift);
-			}
-		} else {
-			formatOK = false;
-		}
-		break;
-	case GE_FORMAT_565:
-		if ((mask & (mask + 1)) == 0 && shift < 16) {
-			const u8 shifts[16] = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4};
-			const int multipliers[16] = {31, 31, 31, 31, 31, 63, 63, 63, 63, 63, 63, 31, 31, 31, 31, 31};
-			const char *rgba = "rrrrrggggggbbbbb";
-			const u8 rgba_shift = shifts[shift];
-			if (rgba_shift == 0 && mask == multipliers[shift]) {
-				sprintf(lookupMethod, "index.%c", rgba[shift]);
-				index_multiplier = multipliers[shift] / 256.0f;
-			} else {
-				// We just need to divide the right component by the right value, and then mod against the mask.
-				// A common case is shift=1, mask=0f.
-				sprintf(lookupMethod, "mod(index.%c * %f, %d.0)", rgba[shift], ((float)multipliers[shift] + 0.99f) / (1 << rgba_shift), mask + 1);
-				index_multiplier = 1.0f / 256.0f;
-				formatOK = mask <= multipliers[shift] - (1 << rgba_shift);
-			}
-		} else {
-			formatOK = false;
-		}
-		break;
-	case GE_FORMAT_5551:
-		if ((mask & (mask + 1)) == 0 && shift < 16) {
-			const char *rgba = "rrrrrgggggbbbbba";
-			const u8 rgba_shift = shift % 5;
-			if (rgba_shift == 0 && mask == 0x1F) {
-				sprintf(lookupMethod, "index.%c", rgba[shift]);
-				index_multiplier = 31.0f / 256.0f;
-			} else if (shift == 15 && mask == 1) {
-				sprintf(lookupMethod, "index.%c", rgba[shift]);
-				index_multiplier = 1.0f / 256.0f;
-			} else {
-				// A isn't possible here.
-				sprintf(lookupMethod, "mod(index.%c * %f, %d.0)", rgba[shift], 31.99f / (1 << rgba_shift), mask + 1);
-				index_multiplier = 1.0f / 256.0f;
-				formatOK = mask <= 31 - (1 << rgba_shift);
-			}
-		} else {
-			formatOK = false;
-		}
-		break;
-	default:
-		break;
-	}
-
-	float texturePixels = 256.f;
-	if (clutFormat != GE_CMODE_32BIT_ABGR8888) {
-		texturePixels = 512.f;
-		index_multiplier *= 0.5f;
-	}
-
-	// Adjust index_multiplier, similar to the use of 15.99 instead of 16 in the ES 3 path.
-	// index_multiplier -= 0.01f / texturePixels;
-
-	if (!formatOK) {
-		ERROR_LOG_REPORT_ONCE(depal, G3D, "%i depal unsupported: shift=%i mask=%02x offset=%d", pixelFormat, shift, mask, clutBase);
-	}
-
-	// Offset by half a texel (plus clutBase) to turn NEAREST filtering into FLOOR.
-	float texel_offset = ((float)clutBase + 0.5f) / texturePixels;
-	sprintf(offset, " + %f", texel_offset);
-
-#ifdef USING_GLES2
-	WRITE(p, "#version 100\n");
-	WRITE(p, "precision mediump float;\n");
-#else
-	WRITE(p, "#version 110\n");
-#endif
-	WRITE(p, "varying vec2 v_texcoord0;\n");
-	WRITE(p, "uniform sampler2D tex;\n");
-	WRITE(p, "uniform sampler2D pal;\n");
-	WRITE(p, "void main() {\n");
-	WRITE(p, "  vec4 index = texture2D(tex, v_texcoord0);\n");
-	WRITE(p, "  float coord = (%s * %f)%s;\n", lookupMethod, index_multiplier, offset);
-	WRITE(p, "  gl_FragColor = texture2D(pal, vec2(coord, 0.0));\n");
-	WRITE(p, "}\n");
-}
-
-#undef WRITE
-
-
 u32 DepalShaderCache::GenerateShaderID(GEBufferFormat pixelFormat) {
 	return (gstate.clutformat & 0xFFFFFF) | (pixelFormat << 24);
 }
@@ -394,11 +185,7 @@ GLuint DepalShaderCache::GetDepalettizeShader(GEBufferFormat pixelFormat) {
 
 	char *buffer = new char[2048];
 
-	if (useGL3_) {
-		GenerateDepalShader300(buffer, pixelFormat);
-	} else {
-		GenerateDepalShader100(buffer, pixelFormat);
-	}
+	GenerateDepalShader(buffer, pixelFormat, useGL3_ ? GLSL_300 : GLSL_140);
 
 	GLuint fragShader = glCreateShader(GL_FRAGMENT_SHADER);
 
diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj
index e0178eac9c40..fd1886b67c54 100644
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@@ -181,6 +181,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="..\ext\xbrz\xbrz.h" />
+    <ClInclude Include="Common\DepalettizeShaderCommon.h" />
     <ClInclude Include="Common\DrawEngineCommon.h" />
     <ClInclude Include="Common\FramebufferCommon.h" />
     <ClInclude Include="Common\GPUDebugInterface.h" />
@@ -199,6 +200,7 @@
     <ClInclude Include="Common\VertexDecoderCommon.h" />
     <ClInclude Include="Debugger\Breakpoints.h" />
     <ClInclude Include="Debugger\Stepping.h" />
+    <ClInclude Include="Directx9\DepalettizeShaderDX9.h" />
     <ClInclude Include="Directx9\GPU_DX9.h" />
     <ClInclude Include="Directx9\helper\dx_state.h" />
     <ClInclude Include="Directx9\helper\fbo.h" />
@@ -239,6 +241,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\ext\xbrz\xbrz.cpp" />
+    <ClCompile Include="Common\DepalettizeShaderCommon.cpp" />
     <ClCompile Include="Common\DrawEngineCommon.cpp" />
     <ClCompile Include="Common\FramebufferCommon.cpp" />
     <ClCompile Include="Common\IndexGenerator.cpp" />
@@ -263,6 +266,7 @@
     <ClCompile Include="Common\VertexDecoderX86.cpp" />
     <ClCompile Include="Debugger\Breakpoints.cpp" />
     <ClCompile Include="Debugger\Stepping.cpp" />
+    <ClCompile Include="Directx9\DepalettizeShaderDX9.cpp" />
     <ClCompile Include="Directx9\GPU_DX9.cpp" />
     <ClCompile Include="Directx9\helper\dx_state.cpp" />
     <ClCompile Include="Directx9\helper\fbo.cpp" />
diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters
index 3826c43d2971..3ee6fe52ba58 100644
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@@ -177,6 +177,12 @@
     <ClInclude Include="Common\DrawEngineCommon.h">
       <Filter>Common</Filter>
     </ClInclude>
+    <ClInclude Include="Common\DepalettizeShaderCommon.h">
+      <Filter>Common</Filter>
+    </ClInclude>
+    <ClInclude Include="Directx9\DepalettizeShaderDX9.h">
+      <Filter>DirectX9</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="Math3D.cpp">
@@ -338,6 +344,12 @@
     <ClCompile Include="Directx9\StencilBufferDX9.cpp">
       <Filter>DirectX9</Filter>
     </ClCompile>
+    <ClCompile Include="Common\DepalettizeShaderCommon.cpp">
+      <Filter>Common</Filter>
+    </ClCompile>
+    <ClCompile Include="Directx9\DepalettizeShaderDX9.cpp">
+      <Filter>DirectX9</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <None Include="CMakeLists.txt" />
diff --git a/Qt/GPU.pro b/Qt/GPU.pro
index f14c017d2ef3..89c58a6bdd45 100644
--- a/Qt/GPU.pro
+++ b/Qt/GPU.pro
@@ -41,6 +41,7 @@ SOURCES += $$P/GPU/GeDisasm.cpp \ # GPU
 	$$P/GPU/GLES/VertexShaderGenerator.cpp \
 	$$P/GPU/Software/*.cpp \
 	$$P/GPU/Debugger/*.cpp \
+	$$P/GPU/Common/DepalettizeShaderCommon.cpp \
 	$$P/GPU/Common/IndexGenerator.cpp \
 	$$P/GPU/Common/TextureDecoder.cpp \
 	$$P/GPU/Common/VertexDecoderCommon.cpp \
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index 6735dabbebb7..d5b95304fff1 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -148,6 +148,7 @@ EXEC_AND_LIB_FILES := \
   $(SRC)/GPU/GPUCommon.cpp \
   $(SRC)/GPU/GPUState.cpp \
   $(SRC)/GPU/GeDisasm.cpp \
+  $(SRC)/GPU/Common/DepalettizeShaderCommon.cpp \
   $(SRC)/GPU/Common/FramebufferCommon.cpp \
   $(SRC)/GPU/Common/IndexGenerator.cpp.arm \
   $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \