The weird art of optimization

In today’s update we talk a bit about a recent find regarding PlayStation micro-optimizations and its unexpected results. Full news after the jump.

So, the other day I was testing some optimized code to render polygons with the old procedures I’ve been using since day 1 of programming with Squeeze Bomb. Let’s see in detail what it does, starting from macros:

// a few extra macros for faster code, needs testing
// these don't abuse the stack to store GTE calculation results
/*
 * Copy GTE (coprocessor 2) data register 7 into the C lvalue `dst` with a
 * single mfc2, so the value is handed back in a CPU register instead of
 * being stored through a pointer.  The renderers in this file read it after
 * the average-Z command and use it as the depth value.
 */
#define gte_stotz_m( dst )					\
	__asm__ volatile ( "mfc2	%0, $7;" : "=r"( dst ) : )

/*
 * Read the GTE FLAG register straight into the C lvalue `r0`.
 *
 * FLAG lives in coprocessor 2 *control* register 31, so it must be fetched
 * with cfc2.  mfc2 $31 addresses *data* register 31 instead (the
 * leading-zero-count result), which means a `value & GTEFLG_ERROR` test on
 * it never reflects the real error flag.
 */
#define gte_stflg_m( r0 ) __asm__ volatile (	\
	"cfc2	%0, $31;"		\
	: "=r"( r0 )			\
	: )

/*
 * Copy GTE (coprocessor 2) data register 24 into the C lvalue `dst` with a
 * single mfc2.  The renderers in this file read it right after
 * gte_nclip_b() and reject the polygon when the value is <= 0.
 */
#define gte_stopz_m( dst )					\
	__asm__ volatile ( "mfc2	%0, $24;" : "=r"( dst ) : )

/*
 * Store GTE data register 22 directly at byte offset 40 from `prim`
 * (the rgb3 field when `prim` points at a POLY_GT4).  The "memory" clobber
 * tells the compiler that the primitive has been written behind its back.
 */
#define gte_strr3_gt4( prim )					\
	__asm__ volatile ( "swc2	$22, 40( %0 );"	: : "r"( prim ) : "memory" )

/*
 * Store GTE data register 14 directly at byte offset 0x2C from `prim`
 * (the xy3 field when `prim` points at a POLY_GT4).  The "memory" clobber
 * tells the compiler that the primitive has been written behind its back.
 */
#define gte_stsxy_gt4_3( prim )					\
	__asm__ volatile ( "swc2	$14, 0x2C( %0 )" : : "r"( prim ) : "memory" )

The following is the actual rendering code for tris and quads:

/*
 * FastTG3L - transform, light and depth-sort a batch of triangles as
 * POLY_GT3 primitives.
 *
 * ob     : read as a const MD1_TRIANGLES; supplies tri_count and the
 *          tri_offset / vertex_offset / normal_offset pointers.
 * packet : read as an array of POLY_GT3; output primitives are written here.
 * rgb    : colour loaded into the GTE.  Its cd field is OVERWRITTEN
 *          (low two bits kept, CODE_PGT3 OR-ed in).
 * ot     : ordering table; every accepted primitive is linked into
 *          ot[otz >> 4].
 *
 * With CRAZY non-zero the GTE results are read straight into a local via
 * the gte_st*_m macros; otherwise they are stored into an IFO struct
 * through a pointer.
 *
 * The statement order is deliberate: plain C work (i++, sx += 2, the tag
 * store) is interleaved with the GTE operations, so do not reorder it
 * casually.
 */
void FastTG3L(void *ob, void *packet, CVECTOR *rgb, u32* ot)
{
	register u32 i, is, *tag;
#if !CRAZY
	IFO ifo;					/* receives flag / orientation / depth through memory */
#else
	register int otz;			/* scratch, reused for flag, orientation and depth */
#endif
	register POLY_GT3 *sx;
	const MD1_TRIANGLES *obj = (const MD1_TRIANGLES*)ob;
	const MD1_TRIANGLE *t = (const MD1_TRIANGLE*)obj->tri_offset;
	const SVECTOR *vp = (const SVECTOR*)obj->vertex_offset;
	const SVECTOR *vn = (const SVECTOR*)obj->normal_offset;

	/* keep the low two bits of the caller's code byte, replace the rest with CODE_PGT3 */
	rgb->cd = (rgb->cd & 3) | CODE_PGT3;
	gte_ldrgb(rgb);

	sx = (POLY_GT3*)packet;

	/* i is incremented inside the body; only the triangle pointer steps here */
	for (i = 0, is = obj->tri_count; i < is; t++)
	{
		POLY_GT3 *si;
		gte_ldv3(&vp[t->v0], &vp[t->v1], &vp[t->v2]);	/* load model vertices */
		i++;
		si = sx;						/* packet slot for this triangle */
		gte_rtpt_b();					/* perspective */

		/*
		 * Every rejection path up to "sx += 2" below advances sx itself, so
		 * the packet pointer moves two POLY_GT3 slots per triangle whether or
		 * not it is drawn (presumably one slot per display buffer - TODO confirm).
		 */
#if !CRAZY
		gte_stflg(&ifo.flg);			/* store flag */
		if (ifo.flg & GTEFLG_ERROR) { sx += 2; continue; }
#else
		gte_stflg_m(otz);				/* flag, read into the scratch register */
		if (otz & GTEFLG_ERROR) { sx += 2; continue; }
#endif
		gte_nclip_b();					/* normal clipping */
#if !CRAZY
		gte_stopz(&ifo.otz);			/* return orientation */
		if (ifo.otz <= 0) { sx += 2; continue; }
#else
		gte_stopz_m(otz);				/* orientation; <= 0 means skip */
		if (otz <= 0) { sx += 2; continue; }
#endif
		gte_stsxy3_gt3(si); /* store transformed result */
		sx += 2;			/* from here on a plain "continue" is enough */
		gte_nop();
		gte_avsz3_b(); /* calculate depth */
#if !CRAZY
		gte_stotz(&ifo.otz); /* get depth */
		if (!(ifo.otz >> 6)) continue;	/* skip when (otz >> 6) is zero, i.e. the depth is too small */
#else
		gte_stotz_m(otz);
		if (!(otz >> 6)) continue;		/* skip when (otz >> 6) is zero, i.e. the depth is too small */
#endif

		gte_ldv3(&vn[t->n0], &vn[t->n1], &vn[t->n2]);	/* set lighting */
#if !CRAZY
		tag = &ot[ifo.otz >> 4];		/* ordering-table slot for this depth */
#else
		tag = &ot[otz >> 4];			/* ordering-table slot for this depth */
		/*
		 * CRAZY writes the primitive's tag before the lighting command is
		 * issued; the other build does the same store after the colours
		 * have been written back (see below).
		 */
		si->tag = (*tag & 0x00FFFFFF) | 0x09000000;
#endif
		gte_ncct_b();								/* calculate */
		gte_strgb3_gt3(si);							/* store rgb values */

		// sort!!
		/*
		 * Link step: the primitive's tag takes over the slot's previous low
		 * 24 bits with 0x09 in the top byte (presumably the packet length in
		 * words - TODO confirm), then the slot is pointed at this primitive.
		 */
#if !CRAZY
		si->tag = (*tag & 0x00FFFFFF) | 0x09000000;
#endif
		*tag = (u32)si & 0x00FFFFFF;
	}
}

/*
 * FastTG4L - transform, light and depth-sort a batch of quads as
 * POLY_GT4 primitives.
 *
 * ob     : read as a const MD1_QUADS; supplies quad_count and the
 *          quad_offset / vertex_offset / normal_offset pointers.
 * packet : read as an array of POLY_GT4; output primitives are written here.
 * rgb    : colour loaded into the GTE.  Its cd field is OVERWRITTEN
 *          (low two bits kept, CODE_PGT4 OR-ed in).
 * ot     : ordering table; every accepted primitive is linked into
 *          ot[otz >> 4].
 *
 * The first three vertices go through one triple transform and the
 * orientation test; the fourth vertex is transformed on its own afterwards.
 * Lighting follows the same 3 + 1 split.
 *
 * With CRAZY non-zero the GTE results are read straight into locals via the
 * gte_st*_m macros; otherwise they are stored into an IFO struct through a
 * pointer.
 *
 * The statement order is deliberate: plain C work is interleaved with the
 * GTE operations, so do not reorder it casually.
 */
void FastTG4L(void *ob, void *packet, CVECTOR *rgb, u32* ot)
{
	register u32 i, is, *tag;
#if !CRAZY
	IFO ifo;				/* receives flags / orientation / depth through memory */
#else
	int otz, flg;			/* otz: orientation, then depth; flg: GTE flag */
#endif
	register POLY_GT4 *sx;
	const MD1_QUADS *obj = (const MD1_QUADS*)ob;
	const MD1_QUAD *q = (const MD1_QUAD*)obj->quad_offset;
	const SVECTOR *vp = (const SVECTOR*)obj->vertex_offset;
	const SVECTOR *vn = (const SVECTOR*)obj->normal_offset;

	/* keep the low two bits of the caller's code byte, replace the rest with CODE_PGT4 */
	rgb->cd = (rgb->cd & 3) | CODE_PGT4;
	gte_ldrgb(rgb);

	sx = (POLY_GT4*)packet;

	/* i is incremented inside the body; only the quad pointer steps here */
	for (i = 0, is = obj->quad_count; i < is; q++)
	{
		POLY_GT4 *si;
		gte_ldv3(&vp[q->v0], &vp[q->v1], &vp[q->v2]);	/* first three vertices */
		si = sx;				/* packet slot for this quad */
		i++;
		gte_rtpt_b();			/* RotTransPers3 */

		/*
		 * Every rejection path up to "sx += 2" below advances sx itself, so
		 * the packet pointer moves two POLY_GT4 slots per quad whether or not
		 * it is drawn (presumably one slot per display buffer - TODO confirm).
		 */
#if !CRAZY
		gte_stflg(&ifo.flg0);
		if (ifo.flg0 & GTEFLG_ERROR) { sx += 2; continue; }
		gte_nclip_b();			/* NormalClip */
		gte_stopz(&ifo.otz);	/* back clip */
		if (ifo.otz <= 0) { sx += 2; continue; }	/* flipped, skip */
#else
		gte_stflg_m(flg);
		if (flg & GTEFLG_ERROR) { sx += 2; continue; }
		gte_nclip_b();			/* NormalClip */
		gte_stopz_m(otz);	/* back clip */
		if (otz <= 0) { sx += 2; continue; } /* flipped, skip */
#endif
		/* write out the three screen coordinates, then queue the fourth vertex */
		gte_stsxy3_gt4((u_long *)si); gte_ldv0(&vp[q->v3]);
		sx += 2;				/* from here on a plain "continue" is enough */
		gte_nop();
		gte_rtps_b();			/* RotTransPers */
#if !CRAZY
		gte_stflg(&ifo.flg);
		if (ifo.flg & GTEFLG_ERROR) continue;
#else
		gte_stflg_m(flg);
		if (flg & GTEFLG_ERROR) continue;
#endif

		gte_stsxy_gt4_3(si);	/* fourth screen coordinate, stored straight into xy3 */
		gte_avsz4();			/* average depth of the four vertices */
#if !CRAZY
		gte_stotz(&ifo.otz);
		// limit range: skip when (otz >> 6) is zero, i.e. the depth is too small
		if (!(ifo.otz >> 6)) continue;
#else
		gte_stotz_m(otz);
		if (!(otz >> 6)) continue;	/* skip when (otz >> 6) is zero, i.e. the depth is too small */
#endif

		gte_ldv3(&vn[q->n0], &vn[q->n1], &vn[q->n2]);	/* normals of the first three vertices */
#if !CRAZY
		tag = &ot[ifo.otz >> 4];	/* ordering-table slot for this depth */
#else
		tag = &ot[otz >> 4];		/* ordering-table slot for this depth */
#endif
		gte_ncct_b();			/* light the first three vertices */
		gte_strgb3_gt4(si);		/* store their colours */

		gte_ldv0(&vn[q->n3]);	/* fourth normal */
		/*
		 * The primitive's tag takes over the slot's previous low 24 bits
		 * with 0x0C in the top byte (presumably the packet length in words -
		 * TODO confirm).  The store sits between loading the normal and
		 * issuing the lighting command.
		 */
		si->tag = (*tag & 0x00FFFFFF) | 0x0C000000;
		gte_nccs_b();			/* light the fourth vertex */
		gte_strr3_gt4(si);		/* store its colour straight into rgb3 */

		// sort!!
		/* point the ordering-table slot at this primitive (low 24 bits of its address) */
		*tag = (u32)si & 0x00FFFFFF;
	}
}

If you are familiar with inline assembly and how the stack works, you can probably notice two minor differences that can translate to better performance. The macros above tend to change one very stupid behavior of Sony’s original tricks to retrieve GTE registers, which were previously stored in memory rather than registers. The code activated via CRAZY = TRUE is the one that uses register direct copies, while the other case defaults to stack writes. It’s not exactly the biggest change ever, but it avoids any unnecessary access to memory, which is a great penalty on the PlayStation.

At first I thought the code wouldn’t work because Sony made the macros work with memory as the only means to access GTE registers, but apparently there are no differences whatsoever in behavior when you use mfc2 (possibly cfc2 too) instead of swc2. I’m still not sure how much this improves the general performance, but it could be enough to prevent any future lag. Similarly, the new macros to access POLY_GT3 and POLY_GT4 diffuse attributes do a little more optimization, even though it’s not that great; all they do is perform straight access on the structures rather than creating temp register values for each attribute.

(Visited 19 times, 1 visits today)

5 opinions on “The weird art of optimization”

  1. Dude you’re the best! Keep it up master!
    I visit your website almost daily and I don’t have any clue what you’re talking about in this post but I still read it because you and your project are fucking AWESOME!

    Best regards
    MTD3

  2. Hi bros! Nice work. Please when you finish it remember the PSP users (like me :D) and create, or at least give us instructions on how to make an eboot.pbp file for playing RE 1.5 on the PSP.

    1. I’m not sure if it is still the case, but when I had a modded PSP ages ago I just used popstation converter to convert PS1 ISOs to eboot. Then if there are any compatibility issues with the game on PSP there is a way you can add versions of the popstation emulator from all firmware versions to your memory stick root and flick between them to see which works best for each PS1 game. I may be remembering a couple things wrong because it’s been years, but I recall as long as you have the ability to flick between them you can almost always find a version with compatibility.

Leave a Reply

Your email address will not be published. Required fields are marked *