Dictionary and Texture2D instances as a key.

Yes, it’s flushed when the texture changes. But by setting SpriteSortMode.Texture in your SpriteBatch.Begin call you can let it sort by texture first.

About your dictionary thing. If you want to associate 1 value with each texture you can just wrap that texture and the value in a class. It doesn’t really make sense to use a dictionary in this case.

How many textures are you thinking of using? I doubt it will be significant to be concerned about collisions. Though why even use a Dictionary<TKey, TValue>? What are you trying to do with the textures exactly? It sounds like you are using the wrong type of data structure for your problem.

Yes, it’s flushed when the texture changes. But by setting SpriteSortMode.Texture in your SpriteBatch.Begin call you can let it sort by texture first.

So if you draw texture A, then B, then A again with SpriteBatch, how can it sort them efficiently with just one buffer?

Anyway, it’s just a test to see if I can make a sort of 2D particle system, because I’ve never tried. So probably a lot of textures and a lot of quads; with my single-texture test I was pushing 25,000.

My idea was to have a bunch of buffers of PositionColorTexture vertices, one for each texture, created as that texture is first used. Then, as I send in each sprite, the dictionary would drop the call right into the correct buffer, so that even if I did have a ton of textures, no sorting at all would be needed. The problem with wrapping the texture is that this class accepts parameters just like SpriteBatch, so the textures won’t be coming in wrapped.

In Draw it would make a DrawAll call that loops over the buffers and issues a draw-primitives call for each, then resets the index pointers for all of them to 0 in one shot.

Well that was the idea but its sort of something that was a test.
Though my other version actually works pretty well on a single texture, I just couldn’t get my head around how to do it fast for bunches of individual textures without some big sorting penalty, so this is what I came up with.

I made it already but it still needs some work you can have a look at what i got so far im open to suggestions.

using System;
using System.Collections.Generic;
using Microsoft.Xna.Framework;
using Microsoft.Xna.Framework.Graphics;

namespace MyDirectSpriteBatcher
{
    /// <summary>
    /// A minimal SpriteBatch-like quad batcher that keeps one CPU-side vertex/index
    /// buffer per texture. Each submitted sprite is routed through a dictionary straight
    /// into the buffer of its texture, so sprites end up grouped by texture without any
    /// sorting pass. Quads accumulate until <c>ClearAll</c> is called; <c>DrawAll</c>
    /// issues one indexed draw per texture (per effect pass).
    /// </summary>
    public class TextureDataBufferMap
    {
        // One quad buffer per texture; the dictionary routes each sprite to its buffer.
        private readonly Dictionary<Texture2D, DrawableTextureQuadDataBuffer> d = new Dictionary<Texture2D, DrawableTextureQuadDataBuffer>();
        // Textures in first-use order; DrawAll draws the buffers in this order.
        private readonly List<Texture2D> texturesList = new List<Texture2D>();
        // Buffer that belongs to currentTexture (null until the first sprite is submitted).
        private DrawableTextureQuadDataBuffer cb;
        /// <summary>
        /// Number of quads allocated for a texture's buffer when the texture is first used.
        /// Values below 1 are treated as 1.
        /// </summary>
        public int InitialQuadCapacity = 256;
        private const int MaxQuadCapacity = short.MaxValue / 6;

        #region Default UV values
        private Vector2 uvLT = new Vector2(0f, 0f);
        private Vector2 uvLB = new Vector2(0f, 1f);
        private Vector2 uvRT = new Vector2(1f, 0f);
        private Vector2 uvRB = new Vector2(1f, 1f);
        #endregion
        private Effect currentEffect;
        private Texture2D currentTexture;
        private int windowWidth = 0;
        private int windowHeight = 0;
        // Pixel -> clip-space scale factors (2 / window size); kept current by UpdateWindowSize.
        private float cw;
        private float ch;

        /// <summary>
        /// Creates a batcher without an effect. An effect must be supplied through
        /// <c>SetCurrentEffect</c> before <c>DrawAll</c> has anything to draw.
        /// </summary>
        /// <param name="windowClientBounds">Client area used to map pixel coordinates to clip space.</param>
        public TextureDataBufferMap(Rectangle windowClientBounds)
            : this(windowClientBounds, null)
        {
        }

        /// <summary>
        /// Creates a batcher that draws with the given effect.
        /// </summary>
        /// <param name="windowClientBounds">Client area used to map pixel coordinates to clip space.</param>
        /// <param name="effectToUse">Effect applied by <c>DrawAll</c>; its "Texture" parameter receives each texture.</param>
        public TextureDataBufferMap(Rectangle windowClientBounds, Effect effectToUse)
        {
            UpdateWindowSize(windowClientBounds);
            InitialQuadCapacity = Math.Min(InitialQuadCapacity, MaxQuadCapacity);
            currentEffect = effectToUse;
        }

        /// <summary>
        /// SpriteBatch-like overload: queues a sprite described by a destination rectangle
        /// (in pixels) and a source rectangle (in texels of <paramref name="texture"/>).
        /// </summary>
        /// <param name="texture">Texture to sample; also selects the buffer the quad is stored in.</param>
        /// <param name="destinationPositionRectangle">Where the quad is placed, in window pixels.</param>
        /// <param name="sourceTextureRectangle">Region of the texture to map onto the quad, in texels.</param>
        /// <param name="color">Colour written to all four vertices.</param>
        /// <param name="rotation">Rotation in radians about the quad's centre.</param>
        /// <param name="scale">Scale applied about the quad's centre.</param>
        /// <param name="depth">Z value written to all four vertices.</param>
        /// <exception cref="ArgumentNullException"><paramref name="texture"/> is null.</exception>
        public void SetSpriteToBatch(
            Texture2D texture,
            Rectangle destinationPositionRectangle,
            Rectangle sourceTextureRectangle,
            Color color,
            float rotation,
            Vector2 scale,
            float depth
            )
        {
            if (texture == null)
            {
                throw new ArgumentNullException("texture");
            }

            Rectangle dest = destinationPositionRectangle;
            var _LT = new Vector2(dest.Left, dest.Top);
            var _LB = new Vector2(dest.Left, dest.Bottom);
            var _RT = new Vector2(dest.Right, dest.Top);
            var _RB = new Vector2(dest.Right, dest.Bottom);

            // Convert the texel rectangle into normalized 0..1 texture coordinates.
            var u = 1f / texture.Width;
            var v = 1f / texture.Height;
            var uvL = (float)sourceTextureRectangle.Left * u;
            var uvR = (float)sourceTextureRectangle.Right * u;
            var uvT = (float)sourceTextureRectangle.Top * v;
            var uvB = (float)sourceTextureRectangle.Bottom * v;

            SetSpriteToBatch(
                texture, _LT, _LB, _RT, _RB,
                new Vector2(uvL, uvT), new Vector2(uvL, uvB), new Vector2(uvR, uvT), new Vector2(uvR, uvB),
                color, rotation, scale, depth);
        }

        /// <summary>
        /// Primary call: queues one quad given its four corners in window pixels and the
        /// texture coordinate of each corner. The corners are scaled and rotated about
        /// their average point, then projected to clip space using the current window size.
        /// </summary>
        /// <param name="texture">Texture to sample; also selects the buffer the quad is stored in.</param>
        /// <param name="_LT">Left-top corner in pixels.</param>
        /// <param name="_LB">Left-bottom corner in pixels.</param>
        /// <param name="_RT">Right-top corner in pixels.</param>
        /// <param name="_RB">Right-bottom corner in pixels.</param>
        /// <param name="uv0">Texture coordinate for the left-top corner.</param>
        /// <param name="uv1">Texture coordinate for the left-bottom corner.</param>
        /// <param name="uv2">Texture coordinate for the right-top corner.</param>
        /// <param name="uv3">Texture coordinate for the right-bottom corner.</param>
        /// <param name="color">Colour written to all four vertices.</param>
        /// <param name="rotation">Rotation in radians about the quad's centre; 0 skips the rotation math.</param>
        /// <param name="scale">Scale applied about the quad's centre.</param>
        /// <param name="depth">Z value written to all four vertices.</param>
        /// <exception cref="ArgumentNullException"><paramref name="texture"/> is null.</exception>
        public void SetSpriteToBatch(
            Texture2D texture,
            Vector2 _LT, Vector2 _LB, Vector2 _RT, Vector2 _RB,
            Vector2 uv0, Vector2 uv1, Vector2 uv2, Vector2 uv3,
            Color color, float rotation, Vector2 scale, float depth
            )
        {
            if (texture == null)
            {
                throw new ArgumentNullException("texture");
            }

            // The local origin is the centre of the quad. To transform about the top-left
            // corner instead (as XNA's SpriteBatch does by default), use _LT here.
            Vector2 origin = (_LT + _LB + _RT + _RB) * .25f;

            // Translate to the local origin and scale.
            var lt = (_LT - origin) * scale;
            var lb = (_LB - origin) * scale;
            var rt = (_RT - origin) * scale;
            var rb = (_RB - origin) * scale;

            if (rotation != 0)
            {
                float sin = (float)Math.Sin(rotation);
                float cos = (float)Math.Cos(rotation);
                lt = Rotate(lt, sin, cos);
                lb = Rotate(lb, sin, cos);
                rt = Rotate(rt, sin, cos);
                rb = Rotate(rb, sin, cos);
            }

            // Move back from the local origin and project to clip space. This relies on the
            // window size being kept current via OnResizeUpdateWindowWidthHeight.
            Vector3 LT = ToClipSpace(lt + origin, depth);
            Vector3 LB = ToClipSpace(lb + origin, depth);
            Vector3 RT = ToClipSpace(rt + origin, depth);
            Vector3 RB = ToClipSpace(rb + origin, depth);

            // Switch buffers only when the texture actually changes.
            if (!object.ReferenceEquals(texture, currentTexture))
            {
                cb = GetOrCreateBuffer(texture);
                currentTexture = texture;
            }

            // Grow before writing so the four vertices and six indices always fit.
            EnsureRoomForQuad(cb);

            int vi = cb.vi_pointer;
            int ti = cb.ti_pointer;

            cb.spriteVertices[vi + 0] = new VertexPositionColorTexture(LT, color, uv0);
            cb.spriteVertices[vi + 1] = new VertexPositionColorTexture(LB, color, uv1);
            cb.spriteVertices[vi + 2] = new VertexPositionColorTexture(RT, color, uv2);
            cb.spriteVertices[vi + 3] = new VertexPositionColorTexture(RB, color, uv3);

            // LT 0   2 RT
            //    |  /|     Triangle 1 is 0 1 2
            //    | / |     Triangle 2 is 2 1 3
            // LB 1   3 RB
            cb.triangleIndexList[ti + 0] = vi + 0;
            cb.triangleIndexList[ti + 1] = vi + 1;
            cb.triangleIndexList[ti + 2] = vi + 2;
            cb.triangleIndexList[ti + 3] = vi + 2;
            cb.triangleIndexList[ti + 4] = vi + 1;
            cb.triangleIndexList[ti + 5] = vi + 3;

            cb.currentQuads += 1;
            cb.vi_pointer += 4;
            cb.ti_pointer += 6;
        }

        /// <summary>
        /// Draws every non-empty texture buffer with the current effect, one indexed
        /// triangle-list draw per texture and effect pass. Buffers are not cleared.
        /// </summary>
        /// <param name="gd">Device used to issue the draw calls.</param>
        /// <param name="allbuffers">Currently ignored; every buffer is always considered.</param>
        /// <exception cref="ArgumentNullException"><paramref name="gd"/> is null.</exception>
        /// <exception cref="InvalidOperationException">There are quads to draw but no effect has been set.</exception>
        public void DrawAll(GraphicsDevice gd, bool allbuffers)
        {
            if (gd == null)
            {
                throw new ArgumentNullException("gd");
            }

            for (int i = 0; i < texturesList.Count; i++)
            {
                Texture2D texture = texturesList[i];
                DrawableTextureQuadDataBuffer buffer = d[texture];
                if (buffer.currentQuads == 0)
                {
                    continue;
                }
                if (currentEffect == null)
                {
                    throw new InvalidOperationException("No effect set; pass one to the constructor or call SetCurrentEffect before DrawAll.");
                }

                currentEffect.Parameters["Texture"].SetValue(texture);

                // numVertices must cover every vertex the index list references,
                // i.e. four per queued quad - not just the four of a single quad.
                int vertexCount = buffer.currentQuads * buffer.VerticesPerQuad();
                int triangleCount = buffer.TriangleDrawCount();

                foreach (EffectPass pass in currentEffect.CurrentTechnique.Passes)
                {
                    pass.Apply();
                    gd.DrawUserIndexedPrimitives(
                        PrimitiveType.TriangleList,
                        buffer.spriteVertices,
                        0,
                        vertexCount,
                        buffer.triangleIndexList,
                        0,
                        triangleCount);
                }
            }
        }

        /// <summary>
        /// Empties every texture buffer by resetting its quad count and write pointers.
        /// The allocated arrays are kept for reuse.
        /// </summary>
        public void ClearAll()
        {
            for (int i = 0; i < texturesList.Count; i++)
            {
                DrawableTextureQuadDataBuffer buffer = d[texturesList[i]];
                buffer.currentQuads = 0;
                buffer.vi_pointer = 0;
                buffer.ti_pointer = 0;
            }
        }

        /// <summary>Replaces the effect used by <c>DrawAll</c>.</summary>
        /// <param name="effectToUse">Effect whose "Texture" parameter receives each texture.</param>
        public void SetCurrentEffect(Effect effectToUse)
        {
            currentEffect = effectToUse;
        }

        /// <summary>
        /// Must be called when the window is resized or toggles fullscreen so that pixel
        /// coordinates keep mapping correctly to clip space.
        /// </summary>
        /// <param name="windowClientBounds">The new client area.</param>
        public void OnResizeUpdateWindowWidthHeight(Rectangle windowClientBounds)
        {
            UpdateWindowSize(windowClientBounds);
        }

        // Stores the window size and the derived pixel -> clip-space scale factors.
        private void UpdateWindowSize(Rectangle windowClientBounds)
        {
            // Clamp to 1 so a zero-sized client area cannot produce an infinite scale.
            windowWidth = Math.Max(1, windowClientBounds.Width);
            windowHeight = Math.Max(1, windowClientBounds.Height);
            cw = 2f / windowWidth;
            ch = 2f / windowHeight;
        }

        // Rotates p about the origin given the precomputed sine and cosine of the angle.
        private static Vector2 Rotate(Vector2 p, float sin, float cos)
        {
            return new Vector2(p.X * cos - p.Y * sin, p.X * sin + p.Y * cos);
        }

        // Maps a pixel position (origin top-left, y down) to clip space (-1..1, y up).
        private Vector3 ToClipSpace(Vector2 pixel, float depth)
        {
            return new Vector3(pixel.X * cw - 1f, pixel.Y * -ch + 1f, depth);
        }

        // Returns the buffer for t, allocating and registering it on first use.
        private DrawableTextureQuadDataBuffer GetOrCreateBuffer(Texture2D t)
        {
            DrawableTextureQuadDataBuffer buffer;
            if (!d.TryGetValue(t, out buffer))
            {
                int capacity = Math.Max(1, InitialQuadCapacity);
                buffer = new DrawableTextureQuadDataBuffer();
                buffer.quadCapacity = capacity;
                buffer.spriteVertices = new VertexPositionColorTexture[capacity * 4];
                buffer.triangleIndexList = new int[capacity * 6];
                d.Add(t, buffer);
                texturesList.Add(t);
            }
            return buffer;
        }

        // Grows the buffer when it has no free slot left for another quad.
        private void EnsureRoomForQuad(DrawableTextureQuadDataBuffer buffer)
        {
            if (buffer.currentQuads >= buffer.quadCapacity)
            {
                IncreaseCapacity(buffer);
            }
        }

        // At least doubles the capacity so repeated growth does not re-copy the arrays
        // once per fixed-size step.
        private void IncreaseCapacity(DrawableTextureQuadDataBuffer buffer)
        {
            int growBy = Math.Max(buffer.quadCapacity, Math.Max(1, InitialQuadCapacity));
            int newCapacity = buffer.quadCapacity + growBy;
            Array.Resize(ref buffer.spriteVertices, newCapacity * 4);
            Array.Resize(ref buffer.triangleIndexList, newCapacity * 6);
            buffer.quadCapacity = newCapacity;
        }
    }
    /// <summary>
    /// CPU-side storage for the quads queued against one texture: a vertex array, an
    /// index array, and the counters that track how much of each is in use.
    /// </summary>
    public class DrawableTextureQuadDataBuffer
    {
        private const int VerticesInQuad = 4;
        private const int TrianglesInQuad = 2;

        // Geometry storage; allocated by the owner of the buffer.
        public VertexPositionColorTexture[] spriteVertices;
        public int[] triangleIndexList;

        // Capacity of the arrays, measured in quads.
        public int quadCapacity;
        // Number of quads currently stored.
        public int currentQuads;
        // Next free slot in spriteVertices.
        public int vi_pointer;
        // Next free slot in triangleIndexList.
        public int ti_pointer;

        /// <summary>Number of quads currently stored.</summary>
        public int TotalQuads()
        {
            return currentQuads;
        }

        /// <summary>Number of vertices that make up one quad (always 4).</summary>
        public int VerticesPerQuad()
        {
            return VerticesInQuad;
        }

        /// <summary>Number of triangles needed to draw the stored quads (two per quad).</summary>
        public int TriangleDrawCount()
        {
            return TrianglesInQuad * currentQuads;
        }

        /// <summary>Length of the vertex array, i.e. allocated vertices rather than vertices in use.</summary>
        public int TotalVertices()
        {
            return spriteVertices.Length;
        }

        /// <summary>Length of the index array, i.e. allocated indices rather than indices in use.</summary>
        public int TotalIndices()
        {
            return triangleIndexList.Length;
        }
    }
    
}

That sounds good. You don’t have to worry about collisions.

Another idea would be to add an index to your Vertex and fill up the device.textures as you add particles. Add some extra code to your shader to pick the right texture.
Monogame allow up to 4 textures so you can batch 4 textures in a single Draw call. (can we max input slots up to 16?)

As for SpriteBatch we could have multiple implementations of SpriteBatcher and initialize the right one based on SortMethod (strategy patern).
For SortMode.Texture we can do something similar to your idea to avoid sorting, although we can’t write directly to a vertex buffer. We might need to flush multiple times in between Begin()/End(), and that’s a breaking change.

Note that this wont help SortMode.FrontToBack. The complexity of keeping items sorted (SortedList?) is the same as sorting them at the end. It could be faster in practise of course but only by some percentage.
SortMode.Texture on the other hand don’t really care about order, all we want is to group them and a HashTable will do just fine.

Yet another idea is to keep a count of how many times each texture is used. Before each batch, calculate the offset of each texture group in the buffer. As you batch, increase the offset and decrease the count of each batched texture. Multiple passes are needed to account for the buffer size limit; you have to skip particles whose offset is > bufferSize.

What do you mean with 4 textures? In the default spritebatch right?

Basically the same thing as multi-texturing, that is really smart didn’t even think about that.

What do you mean with 4 textures? In the default spritebatch?

This isn’t spriteBatch though this just partially mimics it. Like i basically scrunched most of it down to the above code without many of the extras.

To say you wouldn’t call begin or end on this because i skipped most of the extra stuff it does. It just straight drop sprites to the screen via rectangle color rotation ect…

So anyway, he’s saying I could dump 4 textures onto the GPU with a custom vertex structure that has an extra field saying which texture to use, like a float or int on the custom vertex format data itself. A shader can take 4 textures, and you could make a vertex format with 4 sets of texture coordinates to use them all. But you could just drop in an int to switch which texture you want and send in all 4 textures.

As a effect that is known as multitexturing.

    /// <summary>
    /// Vertex layout for multitexturing: position, normal, a four-component texture
    /// coordinate and a second four-component channel (texture weights, or a second
    /// set of UVs). The layout is 14 floats = 56 bytes per vertex, so a cube of 36
    /// vertices takes 2016 bytes.
    /// </summary>
    public struct VertexMultitextured
    {
        // Byte offsets of each element from the start of the vertex.
        private const int PositionOffset = 0;
        private const int NormalOffset = sizeof(float) * 3;
        private const int TextureCoordinateOffset = sizeof(float) * 6;
        private const int TexWeightsOffset = sizeof(float) * 10;

        public Vector3 Position;
        public Vector3 Normal;
        public Vector4 TextureCoordinate;
        // Could instead carry a second UV set, or a "which texture" selector.
        public Vector4 TexWeights;

        /// <summary>Size of one vertex in bytes.</summary>
        public static int SizeInBytes = sizeof(float) * (3 + 3 + 4 + 4);

        /// <summary>
        /// Element descriptions in declaration order; TexWeights is exposed to the shader
        /// as texture-coordinate channel 1.
        /// </summary>
        public static VertexElement[] VertexElements =
        {
            new VertexElement(PositionOffset, VertexElementFormat.Vector3, VertexElementUsage.Position, 0),
            new VertexElement(NormalOffset, VertexElementFormat.Vector3, VertexElementUsage.Normal, 0),
            new VertexElement(TextureCoordinateOffset, VertexElementFormat.Vector4, VertexElementUsage.TextureCoordinate, 0),
            new VertexElement(TexWeightsOffset, VertexElementFormat.Vector4, VertexElementUsage.TextureCoordinate, 1),
        };
    }

But ya it just calls DrawIndexPrimitives.
You can just stick that below a game1 with a empty effect that takes a texture. And set it up and that’s about it it’ll draw. Everything has been pre-calculated to screenspace. Though i literally just wrote it out the last couple days and its minimalistic.

Like e.g. this is my game1 draw call. I loaded up the set calls in update.

        /// <summary>
        /// Clears the screen, draws everything queued in <c>spriteBufferMap</c>, then uses the
        /// regular SpriteBatch to overlay the frame-rate counter and the test statistics.
        /// </summary>
        /// <param name="gameTime">Timing values forwarded to the frame-rate counter and the base class.</param>
        protected override void Draw(GameTime gameTime)
        {
            GraphicsDevice.Clear(Color.TransparentBlack);
            GraphicsDevice.RasterizerState = RasterizerState.CullNone;
            GraphicsDevice.BlendState = Blendstate;

            // Every case of the former switch on whichTest issued this exact call,
            // so a single call is equivalent.
            spriteBufferMap.DrawAll(GraphicsDevice, true);

            //
            // Actual MonoGame SpriteBatch draw calls.
            //
            spriteBatch.Begin();
            framerate.Draw(spriteBatch, gameTime);

            // Build the stats overlay: the first stat opens the text, each later one starts
            // a new line with its label followed directly by its value.
            msb.Clear();
            msb.Append(" Buffer limit: "); msb.Append(itemTest.bufferLimit);
            // Was AppendLine, which put the line break between the label and its value.
            msb.Append("\n Total Draw count: "); msb.Append(totaldrawn);
            msb.Append("\n Dead Marker: "); msb.Append(itemTest.DeadMarker);
            msb.Append("\n Live Marker: "); msb.Append(itemTest.AliveMarker);
            msb.Append("\n Lastchecked: "); msb.Append(itemTest.lastChecked);
            msb.Append("\n Last dead: "); msb.Append(itemTest.lastDead);
            msb.Append("\n numberOfTotalCreates: "); msb.Append(itemTest.numberTotalCreates);
            msb.Append("\n numberOfTotalDead: "); msb.Append(itemTest.numberTotalDeletes);

            Vector2 linePos = new Vector2(30, 300);
            spriteBatch.DrawString(font, msb, linePos, Color.WhiteSmoke);
            spriteBatch.End();

            base.Draw(gameTime);
        }

Though im getting bored of messing with this even though its kinda cool its starting to give me a headache :frowning:

I mean the GraphicsDevice.Textures collection. DX9.x support up to 16 input slots, that’s the textures, right? Haven’t tried to bind more than 4 but there’s a hardcoded limit of 4 somewhere. Maybe it’s an XNA limit, don’t really know.

Haven’t tried this trick with SpriteBatch yet. I am sure it will fail a couple of tests cause XNA’s spritebatch only overrides the first texture.

i bind more than 4 all the time, pretty sure that’s not an issue. Am on dx11

AlbedoMap,
NormalMap,
Roughness,
Metallic,
ShadowMap

and we are over 4

Uh, ok! That’s cool.
Maybe it’s an optimization then or something else, my mistake.

The max of 4 was for render targets, not input textures.

That’s the code I remember from TextureCollection,

It’s indeed about rendertargets and not input textures.

Really 16 ?

So whats the safe low limit between gl dx and the different vs ps levels like common denominator wise?

Because with that many like 16 i could probably switch back to my old single buffers way which is about twice as fast then again maybe not that alone would probably speed this up if i can get it into shape. Maybe even think about using this for bill-boarding distant stuff as well.

The content manager doesn’t internally assign each texture a id count as it loads them in does it ?
Like maybe sticks them in a contiguous list or array ? Because then if i could get a texture id from a texture basically its index, i could sort them like that with buffers corresponding to those index ranges and texture limits, eg tid = 5 then (int)( tid / 16 texture) = buffer 1
ill bet it would be much faster. Though for particles i could probably squeeze a ton onto a single sheet and i doubt i would need more then 16 for just that.

Though now that i think about it if i did make a wrapper more like a class that you drop all your textures into before hand that this uses this and get a id back it might make it a lot faster but it would be more complicated to use.

Well first thing i think i need to do is make a better test. My test class creates so much garbage i cant tell when im making it better or screwing it up… Then ill take a another shot i might as well try it on both since i have the single buffer version as well.

Well, you could just combine all those individual textures into one texture using the idea of a texture atlas and be done with it. Then you only need one vertex buffer and one index buffer. Also you would end up with one draw call in the end which is important in not-next-gen graphics API.

9.1, 9.2, 9.3 & 10.0 cards support 16 input slots.

I wanted to test that idea for some time now, so this morning I tried a little test. Was disappointed at first, because it looks like we need TextureArrays to get this right. TextureArrays are supported only on HiDef/dx10.0.

I went ahead anyway just to see how much performance we can squeeze out of batching textures this way. Nothing fancy with hashtables and groups, I just increment an index and reset/flush whenever the index >= 16.

Well, the shader looks ugly and slow with all those conditional branches and still haven’t figured out how to handle custom effects (need to reapply textures after effect.apply), but the first results are very promising…

I tested with this code

        // Benchmark: alternate between two textures on every iteration so that a batcher
        // which flushes on each texture change is hit with the worst case.
        fpsCounter.StartDrawTimer();
        spriteBatch.Begin(SpriteSortMode.Deferred, null);
        fpsCounter.StartPerfTimer();
        for (int spriteIndex = 0; spriteIndex < amount; spriteIndex++)
        {
            // First texture: random position, tinted red.
            var firstPosition = new Vector2(random.Next(screenSize), random.Next(screenSize));
            spriteBatch.Draw(_texture, firstPosition, Color.Red);

            // Second texture: random 64x64 destination rectangle, untinted.
            int x = random.Next(screenSize);
            int y = random.Next(screenSize);
            spriteBatch.Draw(_tx1, new Rectangle(x, y, 64, 64), Color.White);
        }
        fpsCounter.EndPerfTimer();
        spriteBatch.End();
        fpsCounter.EndDrawTimer(gameTime);

Batch 2 textures : 205%
Batch 4 textures : 321%
Batch 8 textures : 458%
Batch 16 textures : 563%

From the first test with 2 textures/flush, draw time was twice as fast.
Final test with 16 textures was x5.5 times faster! :smile:

Interestingly enough, after 2-4 textures the frame rate stuck at 480fps. Is it possible that I hit GPU bottleneck?

That’s worth implementing then

I’m still a bit worried about the shader. Other GPUs might not be as good with branching.

edit
Also haven’t measured yet how it affect cases where you use a single texture/atlas or on different sortmodes.

looked at the code now, actually it shouldn’t use any dynamic branching since it’s using texture sampling…
Ergo the compiler is probably forced to flatten the whole structure and every pixel reads 16 textures.

Check the compiled shader to be sure, it’s easy with the VS graphics debugging tools

Please test with a lot of fullscreen quads and large textures.

By the way you could try to make different shader techniques for the different texture sizes and see if it has an impact on performance.

The limitation there would be that every texture needs to be the same size and format.

@kosmonautgames

That’s the decompile. On my machine runs the PS4.0 version, right? I see no difference from Reach to HiDef.
but the SM2 version will run slow on a DX9 card.

By the way, do you know if it’s possible to write SM3.0 shaders under PS_4_0_level_9_3 ?

//
// Generated by Microsoft (R) D3D Shader Disassembler
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_Position 0 xyzw 0 POS float
// COLOR 0 xyzw 1 NONE float xyzw
// TEXCOORD 0 xyz 2 NONE float xyz
//
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// SV_Target 0 xyzw 0 TARGET float xyzw
//
//
// Sampler/Resource to DX9 shader sampler mappings:
//
// Target Sampler Source Sampler Source Resource
// -------------- --------------- ----------------
// s0 s0 t0
// s1 s1 t1
// s2 s2 t2
// s3 s3 t3
// s4 s4 t4
// s5 s5 t5
// s6 s6 t6
// s7 s7 t7
// s8 s10 t10
// s9 s11 t11
// s10 s12 t12
// s11 s13 t13
// s12 s14 t14
// s13 s15 t15
//
//
// Level9 shader bytecode:
//
ps_2_0
def c0, 7, 3, 1, -2
def c1, 5, -4, -6, 11
def c2, -10, 13, -12, -14
dcl t0
dcl t1.xyz
dcl_2d s0
dcl_2d s1
dcl_2d s2
dcl_2d s3
dcl_2d s4
dcl_2d s5
dcl_2d s6
dcl_2d s7
dcl_2d s8
dcl_2d s9
dcl_2d s10
dcl_2d s11
dcl_2d s12
dcl_2d s13
0: texld r0, t1, s4
0: texld r1, t1, s5
0: texld r2, t1, s6
0: texld r3, t1, s7
0: texld r4, t1, s2
0: texld r5, t1, s3
0: texld r6, t1, s0
0: texld r7, t1, s1
0: texld r8, t1, s10
0: texld r9, t1, s11
0: add r10.w, t1.z, c1.y
1: mul r10.x, r10.w, r10.w
2: mul r0, r0, t0
3: mul r1, r1, t0
4: cmp r0, -r10.x, r0, r1
5: add r1.x, t1.z, c1.z
6: mul r1.x, r1.x, r1.x
7: mul r2, r2, t0
8: mul r3, r3, t0
9: cmp r1, -r1.x, r2, r3
10: add r2.x, -t1.z, c1.x
11: cmp r0, r2.x, r0, r1
12: add r1.x, t1.z, c0.w
13: mul r1.x, r1.x, r1.x
14: mul r2, r4, t0
15: mul r3, r5, t0
16: cmp r1, -r1.x, r2, r3
17: mul r2, r6, t0
18: mul r3, r7, t0
19: mul r4.x, t1.z, t1.z
20: cmp r2, -r4.x, r2, r3
21: add r3.x, -t1.z, c0.z
22: cmp r1, r3.x, r2, r1
23: add r2.x, -t1.z, c0.y
24: cmp r0, r2.x, r1, r0
25: add r1.x, t1.z, c2.z
26: mul r1.x, r1.x, r1.x
27: mul r2, r8, t0
28: mul r3, r9, t0
29: cmp r1, -r1.x, r2, r3
30: add r2.x, t1.z, c2.w
31: mul r2.x, r2.x, r2.x
32: texld r3, t1, s12
32: texld r4, t1, s13
32: texld r5, t1, s8
32: texld r6, t1, s9
32: mul r3, r3, t0
33: mul r4, r4, t0
34: cmp r2, -r2.x, r3, r4
35: add r3.x, -t1.z, c2.y
36: cmp r1, r3.x, r1, r2
37: add r2.x, t1.z, c2.x
38: mul r2.x, r2.x, r2.x
39: mul r3, r5, t0
40: mul r4, r6, t0
41: cmp r2, -r2.x, r3, r4
42: add r3.x, -t1.z, c1.w
43: cmp r1, r3.x, r2, r1
44: add r2.x, -t1.z, c0.x
45: cmp r0, r2.x, r0, r1
46: mov oC0, r0

// approximately 61 instruction slots used (14 texture, 47 arithmetic)
ps_4_0
dcl_constantbuffer CB0[1], immediateIndexed
dcl_sampler s0, mode_default
dcl_sampler s1, mode_default
dcl_sampler s2, mode_default
dcl_sampler s3, mode_default
dcl_sampler s4, mode_default
dcl_sampler s5, mode_default
dcl_sampler s6, mode_default
dcl_sampler s7, mode_default
dcl_sampler s10, mode_default
dcl_sampler s11, mode_default
dcl_sampler s12, mode_default
dcl_sampler s13, mode_default
dcl_sampler s14, mode_default
dcl_sampler s15, mode_default
dcl_resource_texture2d (float,float,float,float) t0
dcl_resource_texture2d (float,float,float,float) t1
dcl_resource_texture2d (float,float,float,float) t2
dcl_resource_texture2d (float,float,float,float) t3
dcl_resource_texture2d (float,float,float,float) t4
dcl_resource_texture2d (float,float,float,float) t5
dcl_resource_texture2d (float,float,float,float) t6
dcl_resource_texture2d (float,float,float,float) t7
dcl_resource_texture2d (float,float,float,float) t10
dcl_resource_texture2d (float,float,float,float) t11
dcl_resource_texture2d (float,float,float,float) t12
dcl_resource_texture2d (float,float,float,float) t13
dcl_resource_texture2d (float,float,float,float) t14
dcl_resource_texture2d (float,float,float,float) t15
dcl_input_ps linear v1.xyzw
dcl_input_ps linear v2.xyz
dcl_output o0.xyzw
dcl_temps 1
0: ge r0.x, l(7.000000), v2.z
1: if_nz r0.x
2: ge r0.x, l(3.000000), v2.z
3: if_nz r0.x
4: ge r0.x, l(1.000000), v2.z
5: if_nz r0.x
6: eq r0.x, v2.z, l(0.000000)
7: if_nz r0.x
8: sample r0.xyzw, v2.xyxx, t0.xyzw, s0
9: mul o0.xyzw, r0.xyzw, v1.xyzw
10: ret
11: else
12: sample r0.xyzw, v2.xyxx, t1.xyzw, s1
13: mul o0.xyzw, r0.xyzw, v1.xyzw
14: ret
15: endif
16: else
17: eq r0.x, v2.z, l(2.000000)
18: if_nz r0.x
19: sample r0.xyzw, v2.xyxx, t2.xyzw, s2
20: mul o0.xyzw, r0.xyzw, v1.xyzw
21: ret
22: else
23: sample r0.xyzw, v2.xyxx, t3.xyzw, s3
24: mul o0.xyzw, r0.xyzw, v1.xyzw
25: ret
26: endif
27: endif
28: else
29: ge r0.x, l(5.000000), v2.z
30: if_nz r0.x
31: eq r0.x, v2.z, l(4.000000)
32: if_nz r0.x
33: sample r0.xyzw, v2.xyxx, t4.xyzw, s4
34: mul o0.xyzw, r0.xyzw, v1.xyzw
35: ret
36: else
37: sample r0.xyzw, v2.xyxx, t5.xyzw, s5
38: mul o0.xyzw, r0.xyzw, v1.xyzw
39: ret
40: endif
41: else
42: eq r0.x, v2.z, l(6.000000)
43: if_nz r0.x
44: sample r0.xyzw, v2.xyxx, t6.xyzw, s6
45: mul o0.xyzw, r0.xyzw, v1.xyzw
46: ret
47: else
48: sample r0.xyzw, v2.xyxx, t7.xyzw, s7
49: mul o0.xyzw, r0.xyzw, v1.xyzw
50: ret
51: endif
52: endif
53: endif
54: else
55: ge r0.x, l(11.000000), v2.z
56: if_nz r0.x
57: eq r0.x, v2.z, l(10.000000)
58: if_nz r0.x
59: sample r0.xyzw, v2.xyxx, t10.xyzw, s10
60: mul o0.xyzw, r0.xyzw, v1.xyzw
61: ret
62: else
63: sample r0.xyzw, v2.xyxx, t11.xyzw, s11
64: mul o0.xyzw, r0.xyzw, v1.xyzw
65: ret
66: endif
67: else
68: ge r0.x, l(13.000000), v2.z
69: if_nz r0.x
70: eq r0.x, v2.z, l(12.000000)
71: if_nz r0.x
72: sample r0.xyzw, v2.xyxx, t12.xyzw, s12
73: mul o0.xyzw, r0.xyzw, v1.xyzw
74: ret
75: else
76: sample r0.xyzw, v2.xyxx, t13.xyzw, s13
77: mul o0.xyzw, r0.xyzw, v1.xyzw
78: ret
79: endif
80: else
81: eq r0.x, v2.z, l(14.000000)
82: if_nz r0.x
83: sample r0.xyzw, v2.xyxx, t14.xyzw, s14
84: mul o0.xyzw, r0.xyzw, v1.xyzw
85: ret
86: else
87: sample r0.xyzw, v2.xyxx, t15.xyzw, s15
88: mul o0.xyzw, r0.xyzw, v1.xyzw
89: ret
90: endif
91: endif
92: endif
93: endif
94: ret
// Approximately 0 instruction slots used

What do you do when there are fewer than 16 textures? Won’t it error out due to the aggressive optimization of the fx files?

Im so bad with shaders in monogame, i can’t even test it. but i guess since all the textures are loaded you could do sort of texture splatting.

You could probably get around using ifs at all by turning the conditionals into a straight zero-multiply operation via a dot and a trunc,
with an extra value on the vertex structure, though I don’t know if that’s any better than an if: 15 of the textures would be adding 0 just to get one texture’s full value.

1 Like