Dictionary and Texture2D instances as a key.

I’ve looked at the code now; actually, it shouldn’t use any dynamic branching, since it’s using texture sampling…
Ergo the compiler is probably forced to flatten the whole structure and every pixel reads 16 textures.

Check the compiled shader to be sure; it’s easy with the VS graphics debugging tools.

Please test with a lot of fullscreen quads and large textures.

By the way you could try to make different shader techniques for the different texture sizes and see if it has an impact on performance.

The limitation there would be that every texture needs to be the same size and format.

@kosmonautgames

That’s the decompile. On my machine it runs the PS4.0 version, right? I see no difference between Reach and HiDef,
but the SM2 version will run slowly on a DX9 card.

By the way, do you know if it’s possible to write SM3.0 shaders under PS_4_0_level_9_3?

//
// Generated by Microsoft (R) D3D Shader Disassembler
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_Position 0 xyzw 0 POS float
// COLOR 0 xyzw 1 NONE float xyzw
// TEXCOORD 0 xyz 2 NONE float xyz
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_Target 0 xyzw 0 TARGET float xyzw
//
//
// Sampler/Resource to DX9 shader sampler mappings:
//
// Target Sampler Source Sampler Source Resource
// -------------- --------------- ----------------
// s0 s0 t0
// s1 s1 t1
// s2 s2 t2
// s3 s3 t3
// s4 s4 t4
// s5 s5 t5
// s6 s6 t6
// s7 s7 t7
// s8 s10 t10
// s9 s11 t11
// s10 s12 t12
// s11 s13 t13
// s12 s14 t14
// s13 s15 t15
//
//
// Level9 shader bytecode:
//
// Shader Model 2.0 (Level9) variant of the pixel shader.
// This listing contains no flow-control instructions: all 14 declared samplers are
// read with texld for every pixel, each fetched value is multiplied by t0, and the
// value that reaches oC0 is picked by a tree of cmp instructions keyed on t1.z.
// t0 and t1 appear to be the COLOR and TEXCOORD inputs (their masks match the input
// signature above), which would make t1.z the texture index -- confirm against the HLSL.
// Reading aid: "cmp dst, a, b, c" writes b where a >= 0 and c otherwise, so
//   cmp r, -(z - k)^2, A, B   keeps A only when z == k, and
//   cmp r, (k - z), A, B      keeps A when z <= k          (z = t1.z).
// In the notes below, "sN" means "the texel fetched from sampler sN, times t0".
// s8..s13 are Level9 sampler slots; per the mapping table above they correspond to
// source resources t10..t15.
ps_2_0
// Thresholds (7, 3, 1, 5, 11, 13) and negated indices (-2, -4, -6, -10, -12, -14)
// consumed by the add instructions that feed the cmp selects.
def c0, 7, 3, 1, -2
def c1, 5, -4, -6, 11
def c2, -10, 13, -12, -14
dcl t0
dcl t1.xyz
dcl_2d s0
dcl_2d s1
dcl_2d s2
dcl_2d s3
dcl_2d s4
dcl_2d s5
dcl_2d s6
dcl_2d s7
dcl_2d s8
dcl_2d s9
dcl_2d s10
dcl_2d s11
dcl_2d s12
dcl_2d s13
// First batch of unconditional fetches: s0-s7, s10 and s11, all at the same t1 coordinate.
0: texld r0, t1, s4
0: texld r1, t1, s5
0: texld r2, t1, s6
0: texld r3, t1, s7
0: texld r4, t1, s2
0: texld r5, t1, s3
0: texld r6, t1, s0
0: texld r7, t1, s1
0: texld r8, t1, s10
0: texld r9, t1, s11
// r0 = (z == 4) ? s4 : s5
0: add r10.w, t1.z, c1.y
1: mul r10.x, r10.w, r10.w
2: mul r0, r0, t0
3: mul r1, r1, t0
4: cmp r0, -r10.x, r0, r1
// r1 = (z == 6) ? s6 : s7
5: add r1.x, t1.z, c1.z
6: mul r1.x, r1.x, r1.x
7: mul r2, r2, t0
8: mul r3, r3, t0
9: cmp r1, -r1.x, r2, r3
// r0 = (z <= 5) ? r0 : r1        -> candidate for the 4..7 range
10: add r2.x, -t1.z, c1.x
11: cmp r0, r2.x, r0, r1
// r1 = (z == 2) ? s2 : s3
12: add r1.x, t1.z, c0.w
13: mul r1.x, r1.x, r1.x
14: mul r2, r4, t0
15: mul r3, r5, t0
16: cmp r1, -r1.x, r2, r3
// r2 = (z == 0) ? s0 : s1
17: mul r2, r6, t0
18: mul r3, r7, t0
19: mul r4.x, t1.z, t1.z
20: cmp r2, -r4.x, r2, r3
// r1 = (z <= 1) ? r2 : r1        -> candidate for the 0..3 range
21: add r3.x, -t1.z, c0.z
22: cmp r1, r3.x, r2, r1
// r0 = (z <= 3) ? r1 : r0        -> candidate for everything up to 7
23: add r2.x, -t1.z, c0.y
24: cmp r0, r2.x, r1, r0
// r1 = (z == 12) ? s10 : s11     (source t12 / t13)
25: add r1.x, t1.z, c2.z
26: mul r1.x, r1.x, r1.x
27: mul r2, r8, t0
28: mul r3, r9, t0
29: cmp r1, -r1.x, r2, r3
// r2 = (z == 14) ? s12 : s13     (source t14 / t15)
// The remaining four fetches (s12, s13, s8, s9) are issued here, again unconditionally.
30: add r2.x, t1.z, c2.w
31: mul r2.x, r2.x, r2.x
32: texld r3, t1, s12
32: texld r4, t1, s13
32: texld r5, t1, s8
32: texld r6, t1, s9
32: mul r3, r3, t0
33: mul r4, r4, t0
34: cmp r2, -r2.x, r3, r4
// r1 = (z <= 13) ? r1 : r2       -> candidate for the range above 11
35: add r3.x, -t1.z, c2.y
36: cmp r1, r3.x, r1, r2
// r2 = (z == 10) ? s8 : s9       (source t10 / t11)
// There are no samplers for source t8/t9, so with this tree an index of 8 or 9
// ends up with the s9 (source t11) value.
37: add r2.x, t1.z, c2.x
38: mul r2.x, r2.x, r2.x
39: mul r3, r5, t0
40: mul r4, r6, t0
41: cmp r2, -r2.x, r3, r4
// r1 = (z <= 11) ? r2 : r1       -> candidate for everything above 7
42: add r3.x, -t1.z, c1.w
43: cmp r1, r3.x, r2, r1
// Final select: r0 = (z <= 7) ? r0 : r1, then write the colour output.
44: add r2.x, -t1.z, c0.x
45: cmp r0, r2.x, r0, r1
46: mov oC0, r0

// approximately 61 instruction slots used (14 texture, 47 arithmetic)
// Shader Model 4.0 variant of the same pixel shader.
// Unlike the ps_2_0 listing, this one uses real flow control: nested if_nz/else blocks
// narrow down v2.z with ge/eq comparisons, and each leaf issues exactly one sample,
// multiplies it by v1, writes o0 and returns. No path samples more than one texture.
// Per the input signature, v1 is COLOR and v2 is TEXCOORD; v2.xy is used as the sample
// coordinate and v2.z as the selector (presumably the texture index -- confirm in the HLSL).
// Samplers/resources 8 and 9 are not declared; the listing goes from s7/t7 to s10/t10.
ps_4_0
dcl_constantbuffer CB0[1], immediateIndexed
dcl_sampler s0, mode_default
dcl_sampler s1, mode_default
dcl_sampler s2, mode_default
dcl_sampler s3, mode_default
dcl_sampler s4, mode_default
dcl_sampler s5, mode_default
dcl_sampler s6, mode_default
dcl_sampler s7, mode_default
dcl_sampler s10, mode_default
dcl_sampler s11, mode_default
dcl_sampler s12, mode_default
dcl_sampler s13, mode_default
dcl_sampler s14, mode_default
dcl_sampler s15, mode_default
dcl_resource_texture2d (float,float,float,float) t0
dcl_resource_texture2d (float,float,float,float) t1
dcl_resource_texture2d (float,float,float,float) t2
dcl_resource_texture2d (float,float,float,float) t3
dcl_resource_texture2d (float,float,float,float) t4
dcl_resource_texture2d (float,float,float,float) t5
dcl_resource_texture2d (float,float,float,float) t6
dcl_resource_texture2d (float,float,float,float) t7
dcl_resource_texture2d (float,float,float,float) t10
dcl_resource_texture2d (float,float,float,float) t11
dcl_resource_texture2d (float,float,float,float) t12
dcl_resource_texture2d (float,float,float,float) t13
dcl_resource_texture2d (float,float,float,float) t14
dcl_resource_texture2d (float,float,float,float) t15
dcl_input_ps linear v1.xyzw
dcl_input_ps linear v2.xyz
dcl_output o0.xyzw
// A single temp register is enough because each path holds only one sampled texel.
dcl_temps 1
// ---- v2.z <= 7: textures t0..t7 ----
0: ge r0.x, l(7.000000), v2.z
1: if_nz r0.x
// v2.z <= 3
2: ge r0.x, l(3.000000), v2.z
3: if_nz r0.x
// v2.z <= 1: t0 when exactly 0, otherwise t1
4: ge r0.x, l(1.000000), v2.z
5: if_nz r0.x
6: eq r0.x, v2.z, l(0.000000)
7: if_nz r0.x
8: sample r0.xyzw, v2.xyxx, t0.xyzw, s0
9: mul o0.xyzw, r0.xyzw, v1.xyzw
10: ret
11: else
12: sample r0.xyzw, v2.xyxx, t1.xyzw, s1
13: mul o0.xyzw, r0.xyzw, v1.xyzw
14: ret
15: endif
16: else
// 1 < v2.z <= 3: t2 when exactly 2, otherwise t3
17: eq r0.x, v2.z, l(2.000000)
18: if_nz r0.x
19: sample r0.xyzw, v2.xyxx, t2.xyzw, s2
20: mul o0.xyzw, r0.xyzw, v1.xyzw
21: ret
22: else
23: sample r0.xyzw, v2.xyxx, t3.xyzw, s3
24: mul o0.xyzw, r0.xyzw, v1.xyzw
25: ret
26: endif
27: endif
28: else
// 3 < v2.z <= 5: t4 when exactly 4, otherwise t5
29: ge r0.x, l(5.000000), v2.z
30: if_nz r0.x
31: eq r0.x, v2.z, l(4.000000)
32: if_nz r0.x
33: sample r0.xyzw, v2.xyxx, t4.xyzw, s4
34: mul o0.xyzw, r0.xyzw, v1.xyzw
35: ret
36: else
37: sample r0.xyzw, v2.xyxx, t5.xyzw, s5
38: mul o0.xyzw, r0.xyzw, v1.xyzw
39: ret
40: endif
41: else
// 5 < v2.z <= 7: t6 when exactly 6, otherwise t7
42: eq r0.x, v2.z, l(6.000000)
43: if_nz r0.x
44: sample r0.xyzw, v2.xyxx, t6.xyzw, s6
45: mul o0.xyzw, r0.xyzw, v1.xyzw
46: ret
47: else
48: sample r0.xyzw, v2.xyxx, t7.xyzw, s7
49: mul o0.xyzw, r0.xyzw, v1.xyzw
50: ret
51: endif
52: endif
53: endif
54: else
// ---- v2.z > 7: textures t10..t15 ----
// 7 < v2.z <= 11: t10 when exactly 10, otherwise t11.
// Values of 8 or 9 therefore take the t11 path; there is no t8/t9 in this shader.
55: ge r0.x, l(11.000000), v2.z
56: if_nz r0.x
57: eq r0.x, v2.z, l(10.000000)
58: if_nz r0.x
59: sample r0.xyzw, v2.xyxx, t10.xyzw, s10
60: mul o0.xyzw, r0.xyzw, v1.xyzw
61: ret
62: else
63: sample r0.xyzw, v2.xyxx, t11.xyzw, s11
64: mul o0.xyzw, r0.xyzw, v1.xyzw
65: ret
66: endif
67: else
// 11 < v2.z <= 13: t12 when exactly 12, otherwise t13
68: ge r0.x, l(13.000000), v2.z
69: if_nz r0.x
70: eq r0.x, v2.z, l(12.000000)
71: if_nz r0.x
72: sample r0.xyzw, v2.xyxx, t12.xyzw, s12
73: mul o0.xyzw, r0.xyzw, v1.xyzw
74: ret
75: else
76: sample r0.xyzw, v2.xyxx, t13.xyzw, s13
77: mul o0.xyzw, r0.xyzw, v1.xyzw
78: ret
79: endif
80: else
// v2.z > 13: t14 when exactly 14, otherwise t15
81: eq r0.x, v2.z, l(14.000000)
82: if_nz r0.x
83: sample r0.xyzw, v2.xyxx, t14.xyzw, s14
84: mul o0.xyzw, r0.xyzw, v1.xyzw
85: ret
86: else
87: sample r0.xyzw, v2.xyxx, t15.xyzw, s15
88: mul o0.xyzw, r0.xyzw, v1.xyzw
89: ret
90: endif
91: endif
92: endif
93: endif
// Every leaf above already returns, so this trailing ret only terminates the listing.
94: ret
// Approximately 0 instruction slots used

What do you do when there are fewer than 16 textures? Won’t it error out due to the aggressive optimization of the .fx files?

I’m so bad with shaders in MonoGame that I can’t even test it, but I guess that since all the textures are loaded, you could do a sort of texture splatting.

You could probably avoid using ifs altogether by turning the conditionals into a straight zero-multiply operation via a dot and trunc,
with an extra value on the vertex structure, though I don’t know if that’s any better than an if. 15 of the textures would be adding 0 just to get one texture’s full value.

1 Like