Android: Polygon Slowdown

Monkey Forums/Monkey Programming/Android: Polygon Slowdown

dragon

(Posted 2012) [#1]

This is not really a bug,
but a known issue on ANDROID devices without CPU floating point support. I have here a Galaxy Tab 7 and can see it!

The Problem is simple:
Every floating-point operation is calculated in software (by Java or similar) if there is no CPU support. And this is very SLOOOOOOOOOOOOOW!

http://www.badlogicgames.com/wiki/index.php/Direct_Bulk_FloatBuffer.put_is_slow

Imagine, you want to draw ovals, so you need many polygons.

I did a test:
drawing only 72 images: i get 60 fps
drawing only 72 ovals (80x80), i get 20 fps (!!!!!)

I could fix this on android (JAVA, not monkey) with some tricks:
1) i always use int numbers instead of floats for polygons
2) i use a buffer where i store as many polys as possible and then push it to the device

PseudoCode for quads:

Quad(x, y, u, v) 'set edges 1 to 4
Quad(x, y, u, v)
Quad(x, y, u, v)
Quad(x, y, u, v)
...
Flush()

Flush should be called automatically at Flip or when the buffer is full.
Or Flush can be called automatically at some other points.

@MARKSIBLY: is there any chance to add this to mojo?
We need many int commands for DrawRect, DrawOval, DrawImage etc...
This is very important...

Java Sample Code:

package test;
import java.nio.ByteOrder;
import java.nio.ByteBuffer;
import java.nio.ShortBuffer;
import javax.microedition.khronos.opengles.GL10;



/**
 * Batches textured, colored quads into short/byte vertex arrays and submits
 * them to OpenGL ES 1.x with a single glDrawElements call per batch.
 *
 * Usage, as shown by the methods below: call {@link #Update()} once the GL
 * context is available (it allocates the buffers, binds the client-side
 * arrays and resets the batch), call {@code Quad(...)} four times per quad
 * (one call per corner), and call {@link #Flush()} to draw what has been
 * collected. The batch is flushed automatically when it reaches
 * {@code vertexmax} vertices.
 *
 * All state is static; nothing here is synchronized.
 */
public class Draw
{

	///////////////////////////////////////////////////////////////////////////

	// CPU-side staging arrays filled by Quad(); copied into the direct buffers in Flush().
	protected static short[] indexarray;
	protected static short[] vertexarray;
	protected static short[] texturearray;
	protected static byte[]  colorarray;

	// Direct buffers handed to GL as index / position / texcoord / color arrays.
	protected static ShortBuffer indexbuffer;
	protected static ShortBuffer vertexbuffer;
	protected static ShortBuffer texturebuffer;
	protected static ByteBuffer  colorbuffer;

	protected static final int vertexmax = 2048; //512 quads x 4 vertices
	protected static int vertexcount; // vertices currently in the batch
	protected static int indexcount;  // indices currently in the batch

	// Current color, written to every vertex added by Quad(). (byte)255 is 0xFF, i.e. full intensity as an unsigned byte.
	protected static byte red   = (byte)255;
	protected static byte green = (byte)255;
	protected static byte blue  = (byte)255;
	protected static byte alpha = (byte)255;

	///////////////////////////////////////////////////////////////////////////

	/**
	 * Lazily allocates the staging arrays and direct buffers, binds the buffers
	 * as GL client arrays when {@code View.gl} is available, and resets the
	 * batch counters. Vertices that were batched but not flushed are discarded.
	 */
	public static void Update ()
	{
		if (indexarray   == null) indexarray   = new short[vertexmax / 4 * 6]; //i (6 indices per quad)
		if (vertexarray  == null) vertexarray  = new short[vertexmax * 2]; //xy
		if (texturearray == null) texturearray = new short[vertexmax * 2]; //uv
		if (colorarray   == null) colorarray   = new byte [vertexmax * 4]; //rgba

		if (indexbuffer   == null) indexbuffer   = newShortBuffer(vertexmax / 4 * 6);
		if (vertexbuffer  == null) vertexbuffer  = newShortBuffer(vertexmax * 2);
		if (texturebuffer == null) texturebuffer = newShortBuffer(vertexmax * 2);
		if (colorbuffer   == null) colorbuffer   = ByteBuffer.allocateDirect(vertexmax * 4 * 1);

		if (View.gl != null)
		{
			View.gl.glEnableClientState(GL10.GL_VERTEX_ARRAY);
			View.gl.glVertexPointer(2, GL10.GL_SHORT, 0, vertexbuffer);

			View.gl.glEnableClientState(GL10.GL_TEXTURE_COORD_ARRAY);
			View.gl.glTexCoordPointer(2, GL10.GL_SHORT, 0, texturebuffer);

			View.gl.glEnableClientState(GL10.GL_COLOR_ARRAY);
			View.gl.glColorPointer(4, GL10.GL_UNSIGNED_BYTE, 0, colorbuffer);
		}

		vertexcount = 0;
		indexcount = 0;
	}

	/** Allocates a direct, native-byte-order buffer holding {@code shorts} 16-bit values. */
	private static ShortBuffer newShortBuffer (int shorts)
	{
		ByteBuffer bytes = ByteBuffer.allocateDirect(shorts * 2); // 2 bytes per short
		bytes.order(ByteOrder.nativeOrder());
		return bytes.asShortBuffer();
	}

	///////////////////////////////////////////////////////////////////////////

	/**
	 * Copies the batched data into the direct buffers, draws it as indexed
	 * triangles and empties the batch.
	 *
	 * Does nothing while fewer than four vertices (one quad) are batched. If
	 * {@code View.gl} is null the batch cannot be drawn; it is dropped and the
	 * counters are reset so that an automatic flush from {@code Quad} cannot
	 * overrun the staging arrays. This mirrors the null check in
	 * {@link #Update()}.
	 *
	 * Note: indices are emitted per corner, so calling this in the middle of a
	 * quad submits the indices of that incomplete quad as well; call it on quad
	 * boundaries.
	 */
	public static void Flush ()
	{
		if (vertexcount < 4) return;

		if (View.gl == null)
		{
			// No GL context: previously this path threw a NullPointerException.
			vertexcount = 0;
			indexcount = 0;
			return;
		}

		indexbuffer.put(indexarray, 0, indexcount);
		vertexbuffer.put(vertexarray, 0, vertexcount * 2);
		texturebuffer.put(texturearray, 0, vertexcount * 2);
		colorbuffer.put(colorarray, 0, vertexcount * 4);

		// Rewind so GL reads from the start and the next Flush() writes from the start.
		indexbuffer.position(0);
		vertexbuffer.position(0);
		texturebuffer.position(0);
		colorbuffer.position(0);

		View.gl.glDrawElements(GL10.GL_TRIANGLES, indexcount, GL10.GL_UNSIGNED_SHORT, indexbuffer);

		vertexcount = 0;
		indexcount = 0;
	}

	///////////////////////////////////////////////////////////////////////////

	/**
	 * Adds one corner of a quad to the batch using the current color.
	 *
	 * Four consecutive calls form one quad made of the triangles (v0, v1, v2)
	 * and (v1, v3, v2). Requires {@link #Update()} to have been called first,
	 * since it allocates the staging arrays. Flushes automatically once
	 * {@code vertexmax} vertices are batched.
	 *
	 * @param x vertex x position; narrowed to a 16-bit short
	 * @param y vertex y position; narrowed to a 16-bit short
	 * @param u texture u coordinate; narrowed to a 16-bit short
	 * @param v texture v coordinate; narrowed to a 16-bit short
	 */
	public static void Quad (int x, int y, int u, int v)
	{
		final int xy = vertexcount * 2;
		final int rgba = vertexcount * 4;

		vertexarray[xy] = (short)x;
		vertexarray[xy + 1] = (short)y;

		texturearray[xy] = (short)u;
		texturearray[xy + 1] = (short)v;

		colorarray[rgba] = red;
		colorarray[rgba + 1] = green;
		colorarray[rgba + 2] = blue;
		colorarray[rgba + 3] = alpha;

		if (vertexcount % 4 != 3) //point 1-3
		{
			indexarray[indexcount] = (short)vertexcount;
			indexcount++;
		}
		else //point 4: second triangle reuses points 2 and 3
		{
			indexarray[indexcount]     = (short)(vertexcount-2);
			indexarray[indexcount + 1] = (short)vertexcount;
			indexarray[indexcount + 2] = (short)(vertexcount-1);
			indexcount = indexcount + 3;
		}

		vertexcount++;
		if (vertexcount == vertexmax) Flush();
	}

	///////////////////////////////////////////////////////////////////////////

	/**
	 * Sets the current color and then adds one quad corner with it. The color
	 * stays in effect for later {@code Quad(x, y, u, v)} calls.
	 *
	 * @param x vertex x position
	 * @param y vertex y position
	 * @param u texture u coordinate
	 * @param v texture v coordinate
	 * @param red   red component; only the low 8 bits are kept
	 * @param green green component; only the low 8 bits are kept
	 * @param blue  blue component; only the low 8 bits are kept
	 * @param alpha alpha component; only the low 8 bits are kept
	 */
	public static void Quad (int x, int y, int u, int v, int red, int green, int blue, int alpha)
	{
		Color(red, green, blue, alpha);
		Quad(x, y, u, v);
	}

	///////////////////////////////////////////////////////////////////////////

	/**
	 * Sets the color written to every vertex added from now on.
	 *
	 * @param red   red component; only the low 8 bits are kept
	 * @param green green component; only the low 8 bits are kept
	 * @param blue  blue component; only the low 8 bits are kept
	 * @param alpha alpha component; only the low 8 bits are kept
	 */
	public static void Color (int red, int green, int blue, int alpha)
	{
		Draw.red   = (byte)red;
		Draw.green = (byte)green;
		Draw.blue  = (byte)blue;
		Draw.alpha = (byte)alpha;
	}

}

AdamRedwoods

(Posted 2012) [#2]

Note that Ovals are made up of many triangles, so there's more overhead. Shouldn't be much, though.

I have a Galaxy Tab P1010 and it is about the expected speed (slow, but manageable). PowerVR SGX530 and CortexA-8 have FPU.

Also try running Linpack to get accurate MFlops if you are curious.

In my tests, Math.Sin and Math.Cos really slow down Android devices. I'd create a lookup table and that may speed up oval drawing. (minimum 12 segments x2 sin/cos calculations x72 = 1728 cosine calculations per frame... slow)

marksibly

(Posted 2012) [#3]

Moving this to programming...

We went through a lot of this in the early days of Monkey - a lot of links out there (eg: the "FloatBuffer.put is slow" one) are dated, and turned out to only apply to a subset of very early android devices.

After much trial and error, we managed to get Monkey performance up to about par with the other 2D libs out there - ie: they're all probably hardware limited. There are tradeoffs - some approaches work better on some HW than others - but in general I'm happy with Mojo's performance on Android.

Mojo does buffer rendering - check out the source - but not ovals because they can consume a lot of vertices. This could possibly be improved, but quads, points, lines are all 'batched'. Also, there IS overhead per triangle, so ovals will always be 'slowish'.

AdamRedwoods

(Posted 2012) [#4]

Also.... it looks like mojo is using java.lang.Math which converts doubles to floats.

Hm, maybe use FloatMath? It says it takes 1/3 the time. I may think about using this in miniB3D as well...
http://developer.android.com/reference/android/util/FloatMath.html

marksibly

(Posted 2012) [#5]

Hi,

> Hm, maybe use FloatMath?

Nice find! Could be the way to go for Sin/Cos etc...

Although this is a worry:

http://code.google.com/p/android/issues/detail?id=36199

Perhaps it uses 'fast' software emulation instead of the FPU...?

This seems to agree:

http://www.badlogicgames.com/wordpress/?p=796

So what does this mean? FloatMath works a tiny bit better on older devices with no FPU as expected. On newer devices with FPU it’s actually slower it seems due to load conversions to and from the FPU registers for 32-bit floats. Function calls are still evil and nasty even with the JIT.

dragon

(Posted 2012) [#6]

in my tab i have nearly same HW, but a bit better Video-Chip

ARM Cortex-A8
PowerVR-SGX540

more details about CPU:
http://en.wikipedia.org/wiki/ARM_Cortex-A8
http://en.wikipedia.org/wiki/Samsung_Exynos#Exynos_3110

FPU on ARM-A8 is optional, i could not find out if it is integrated

i can check out later what is the problem:
-float generally
-or openGL float

dragon

(Posted 2012) [#7]

Mojo does buffer rendering - check out the source

i checked it... yes mojo use buffers,
but i see that matrix calculations are done per software and not per hardware using openGL commands (glRotate, glScale etc...)

best way for me is use gl-commands for transform and then push raw int vertices without software matrices calculations

and this should run fine on any platform - or?

marksibly

(Posted 2012) [#8]

Hi,

> but i see that matrix calculations are done per software and not per hardware using openGL commands (glRotate, glScale etc...)

This is about the slowest way you can do 2D rendering.

It means you can't batch primitives - because they must be drawn/flushed to be rotated/scaled etc - which means many more calls to OS/GL functions AND a great reduction in what can be done in parallel with the GPU.

It's also not supported in GL2.0, which isn't a problem right now but..

AdamRedwoods

(Posted 2012) [#9]

Function calls are still evil and nasty even with the JIT.

ok. darn. thought maybe they had implemented LUTs.

dragon

(Posted 2012) [#10]

This is about the slowest way you can do 2D rendering.
It means you can't batch primitives - because they must be drawn/flushed to be rotated/scaled etc - which means many more calls to OS/GL functions AND a great reduction in what can be done in parallel with the GPU.

Mark, i think you don't understand what i mean...
Here are 2 situations:

1) many objects with different rotation, scale...
2) many objects with same rotation, scale...

Your code is effective for first situation.
It is OK to use floats, because you MUST
calculate matrices using floats

For second situation, i think it is not efficient.

If i setup a screen rotation and scale once
(how many gl-calls are this? only few),
then i could draw massive count of polys with same matrix.
But not per software each time using floats.
Instead of this i CAN NOW use ints only (!) and ignoring matrix
floating calculations and i can push many many more int
vertices to device.

And you CAN batch primitives - so long nothing changes on matrix!!!

The problem with older androids < 3.0 is not drawing polygons,
the problem is pushing all the data to the device - this is much slower than drawing!!! and this is incredible! There is a bottleneck somewhere...

I think it is possible to mix both situations with only one code...
You can add something like a "switch".
So long you use old float DrawImage etc, you use old code #1 and batch in old float buffers. In special cases, when you use Int DrawImage etc, you can use new code #2 and batch in new int buffer (when you flush data, you must temporary change hardware device matrix, because both matrices situations are different!). Do you understand what i mean?

Situation 1:
-calculate vertices using matrices per software
-save this data in float buffer
-push float buffer to device without changing device matrix

Situation 2:
-save data in int buffer without change
-change device matrix
-push int buffer to device
-change device matrix back to identity to keep compatibility with old code

You see that situation 2 work good only with batched polys,
so matrix change should happen very seldom.

It's also not supported in GL2.0, which isn't a problem right now but..

yes, in GL2 you must use shaders
this is a bit different...
i don't like this also...
because you need separate flush code for GL1 and GL2,

http://mobileorchard.com/getting-started-with-opengl-es-20-on-the-iphone-3gs/

AdamRedwoods

(Posted 2012) [#11]

So what you are saying is that you would want a situation #2 added?
And situation #2 is where the vertices are being modified by a global matrix (device matrix), rather than individual matrixes?

Could you use a write pixels to image for this? Are you using this for a tiled background?

dragon

(Posted 2012) [#12]

i modify mojo a bit and add int buffer to test it
most code is from above - it have only a very simple buffer for one texture type and only for images

so here is the benchmark result:

'number of images (16x16 with some alpha) to slowdown app from 60 fps to 30 fps / window 1024x600
'---------------------------
'HTML5: 2600
'flash: 3700
'glfw: 66000
'xna: 130000
'android: 6600
'android: 13000 (int buffer!)

as you can see i could DOUBLE the speed on android with that code
the trick is to use SHORT buffer for vertex & texture

texture is a bit tricky - it must be scaled up with a matrix (not done yet) to get a range over 0..1

Transformations from monkey are not used yet (only a dummy), but openGL have already a transformation applied and calculate with it.

here is benchmark code without mojo modifications:

Strict
Import mojo

' Number of images drawn per frame by Program.OnRender.
Global nr:Int = 13000

' Benchmark app: every frame draws `nr` copies of one image in a grid
' and prints the average frames per second since the first rendered frame.
Class Program Extends App

	' Image drawn repeatedly, frame counter, and start time (ms) of the measurement.
	Global img:Image
	Global frames:Int
	Global time:Int
	
	' Requests 60 updates per second and loads the test image.
	Method OnCreate:Int ()
		SetUpdateRate(60)
		img = LoadImage("icon.png") '16x16
		Return 0
	End

	' No per-update logic; all work happens in OnRender.
	Method OnUpdate:Int ()
		Return 0
	End

	' Clears the screen, draws the image `nr` times, then draws the fps text.
	Method OnRender:Int ()
		' Largest x / y at which an image is still placed before wrapping.
		Local w:Int = DeviceWidth() - img.Width()
		Local h:Int = DeviceHeight() - img.Height()
		Local i:Int,x:Int,y:Int

		Cls(255,0,255)
		For i = 1 To nr
			DrawImage(img, x, y)
			'DrawImageX(img, x, y, 16, 16, 0, 0, 1, 1)
			' Advance one image to the right; past the right limit start a new row,
			' past the bottom limit start again at the top.
			x = x + img.Width()
			If x > w Then x=0; y=y+img.Height()
			If y > h Then y=0
		End
		'Flush2()
		
		' The first rendered frame records the start time; fps = frames / elapsed seconds.
		' NOTE(review): on that first frame frames is 0 and the elapsed time is presumably 0 too (0/0) - confirm.
		If frames = 0 Then time = Millisecs()
		Local fps:Int = Float(frames) / (Float(Millisecs() - time) / 1000.0)
		DrawText("fps:" + String(fps), 10, 10)
		frames = frames + 1

		Return 0
	End

End

' Entry point: creates the benchmark Program instance.
Function Main:Int ()
	New Program()
	Return 0
End

AdamRedwoods

(Posted 2012) [#13]

using a short buffer leads to half the amount of bytes being transferred.

marksibly

(Posted 2012) [#14]

Hi,

I had a hack at short buffers this afternoon, and it does seem to make a significant difference (in bouncyaliens anyway) on my Nexus7.

See bottom of product updates section for an experimental version - rename to mojo.android.java and drop it in modules/mojo/native.

Some issues:

I don't see how gl transforms can help unless you're talking about one transform for a bunch of primitives. Even then, I think short buffers will generally need some sort of CPU transform regardless...

The issue is that for int vertices to work, they really need to be in device/pixel space when they're written to the buffers. People frequently use transforms to implement virtual resolutions, so unless you're OK with stuff 'jumping' around depending on device res vs virtual res, this transform needs to be done first.

Another alternative would be to use some kind of fixed point system, but it's hard to do this in a general way - eg: how would it support an app that decided to map the entire display to a virtual size of 1 x 1? As it is, I've implemented 4 bits 'fraction' in the above anyway because I don't like the idea of losing subpixel accuracy completely.

This has also required some extra code in DrawBlah to throw away stuff that overflows a short, but this surprisingly is still a significant win in bouncyaliens. What the hell are those drivers up to?!?

Anyway, have a play and test with as much as you can!

dragon

(Posted 2012) [#15]

thx mark,

i did again some speed test - this time with floats,
i could speed up mojo drawimage

you should look at function
DrawImage & Co

i do not understand why you use "If context.tformed" here.
it make no sense...

i replaced DrawImage with new one and get 25% speed boost:
images are normaly transformed - i do not see any difference

' Draws one frame of `image` at (x,y), offset by the image's tx/ty values.
' This variant has no branch on context.tformed: it validates the context and
' passes the coordinates straight to the render device.
'   image - image to draw
'   x, y  - draw position (floats)
'   frame - index into image.frames (default 0)
Function DrawImage( image:Image,x#,y#,frame=0 )
#If CONFIG="debug"
	DebugRenderDevice
#End
	Local f:Frame=image.frames[frame]
	context.Validate
	' FullFrame images draw the whole surface; otherwise only the frame's
	' sub-rectangle (f.x, f.y, image.width, image.height) is drawn.
	If image.flags & Image.FullFrame
		renderDevice.DrawSurface image.surface,x-image.tx,y-image.ty
	Else
		renderDevice.DrawSurface2 image.surface,x-image.tx,y-image.ty,f.x,f.y,image.width,image.height
	Endif
End

I also add only few more lines to mojo to temporary switch to a new mode
and get 36% more speed - compared to normal drawimage

'1) replace above code in graphics.monkey

'2) add this to graphicsdevice.monkey:
	Method DrawMode( mode:Bool )

'3) add this to graphics.monkey:
' Validates the current context and forwards `mode` to renderDevice.DrawMode.
' NOTE(review): per the thread, True is meant to select the hardware (GL matrix) transform path - confirm against the native implementation.
Function DrawMode( mode:Bool )
#If CONFIG="debug"
	DebugRenderDevice
#End
	context.Validate
	renderDevice.DrawMode mode
End

and add this diff to mojo.android.java:

--- C:/Prog/Prog/MonkeyPro66/modules/mojo_backup/native/mojo.android.java	Tue Sep 25 14:44:22 2012
+++ C:/Prog/Prog/MonkeyPro66/modules/mojo/native/mojo.android.java	Thu Oct 04 21:41:55 2012
@@ -607 +607,2 @@
-	
+	boolean drawmode;
+
@@ -638,0 +640,9 @@
+	int DrawMode( boolean mode ){
+		if (drawmode!=mode)
+		{
+			Flush();
+			drawmode=mode;
+		}
+		return 0;
+	}
+	
@@ -678,0 +689 @@
+		drawmode=false;
@@ -707,0 +719,9 @@
+		if (drawmode==true && tformed==true){
+			float m[]={
+			ix,iy,0,0,
+			jx,jy,0,0,
+			0,0,1,0,
+			tx,ty,0,1};
+			GLES11.glMultMatrixf(m, 0);
+		}
+		
@@ -743,0 +764,2 @@
+
+		if (drawmode==true && tformed==true) GLES11.glLoadIdentity();
@@ -853 +875 @@
-	
+		if (drawmode==true) Flush();
@@ -1028 +1050 @@
-		if( tformed ){
+		if( tformed==true && drawmode==false ){
@@ -1065 +1087 @@
-		if( tformed ){
+		if( tformed==true && drawmode==false ){

this speed up transformed images only

marksibly

(Posted 2012) [#16]

Hi,

>i do not understand why you use "If context.tformed" here.
>it make no sense...

I think you're right! I believe it's the 'old code' effect...

Will do some more testing but this looks like a nice improvement.

How is DrawMode used though? Is there any way it could be 'autodetected', ie: without an extra command?

AdamRedwoods

(Posted 2012) [#17]

i think he's using DrawMode to switch to the glMatrix.

dragon

(Posted 2012) [#18]

Enable special mode with "DrawMode(true)"
this is good for many polys with same transformation
that give few % more speed

disable with "DrawMode(false)"

i do not think you can autodetect that
this is what user should do
false-mode calculates matrices in software for each vertex
true-mode calculates matrices in hardware