diff --git a/Makefile b/Makefile index c389c6e7..4d5fcfef 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ TOPDIR = $(realpath .) -SUBDIRS = tools lib system effects +SUBDIRS = tools lib system effects demo EXTRA-FILES = tags cscope.out CLEAN-FILES = bootloader.bin a500rom.bin addchip.bootblock.bin vbrmove diff --git a/build/effect.mk b/build/effect.mk index 6e0ef76a..ded89ef4 100644 --- a/build/effect.mk +++ b/build/effect.mk @@ -17,6 +17,7 @@ VBRMOVE = $(TOPDIR)/vbrmove EXTRA-FILES += $(DATA_GEN) $(EFFECT).adf CLEAN-FILES += $(DATA_GEN) $(EFFECT).exe $(EFFECT).exe.dbg $(EFFECT).exe.map +CLEAN-FILES += $(EFFECT).img $(EFFECT).rom ifeq ($(AMIGAOS), 0) EXTRA-FILES += $(EFFECT).img $(EFFECT).rom diff --git a/demo/Makefile b/demo/Makefile new file mode 100644 index 00000000..f0645fb7 --- /dev/null +++ b/demo/Makefile @@ -0,0 +1,69 @@ +TOPDIR := $(realpath ..) + +DELTA := 1 + +DEMONAME = Demo +MODULE := JazzCat-ElectricLifeforms + +OBJECTS := data/$(MODULE).trk.o + +ifeq ($(DELTA), 1) +OBJECTS += data/$(MODULE)-Delta.smp.o +else +OBJECTS += data/$(MODULE).smp.o +endif + +LIBS := libpt + +CLEAN-FILES := data/$(MODULE)*.o data/$(MODULE).smp* data/$(MODULE).trk* data/$(MODULE)-Delta.smp data/loader.c # globs spelled out: {a,b} braces are not expanded by plain /bin/sh + +SOURCES := main.c load.c empty.c +MAIN := # + +CPPFLAGS += -DDELTA=$(DELTA) -I. 
+ +PNG2C.loader := --bitmap loader,64x64x1 --palette loader,2 + +include $(TOPDIR)/build/effect.mk + +data/%.trk data/%.smp: data/%.mod # split the .mod into its .trk and .smp parts with ptsplit.py + $(PYTHON3) $(TOPDIR)/effects/playpt/data/ptsplit.py $^ + mv data/$*.mod.trk data/$*.trk + mv data/$*.mod.smp data/$*.smp + +OBJCOPY-CHIP := --rename-section .data=.datachip,alloc,load,data,contents + +data/%.trk.o: data/%.trk + @echo "[OBJCOPY] $(DIR)$< -> $(DIR)$@" + $(OBJCOPY) -I binary -O amiga \ + --redefine-sym _binary_data_$(subst -,_,$*)_trk_start=_Module \ + --redefine-sym _binary_data_$(subst -,_,$*)_trk_end=_ModuleEnd \ + --redefine-sym _binary_data_$(subst -,_,$*)_trk_size=_ModuleSize \ + $^ $@ + +data/%-Delta.smp: data/%.smp # delta-encode the samples with delta.py (used when DELTA=1) + @echo "[DELTA] $(DIR)$< -> $(DIR)$@" + $(PYTHON3) delta.py $< $@ + +data/%.smp.o: data/%.smp + @echo "[OBJCOPY] $(DIR)$< -> $(DIR)$@" + $(OBJCOPY) $(OBJCOPY-CHIP) -I binary -O amiga \ + --redefine-sym _binary_data_$(subst -,_,$*)_smp_start=_Samples \ + --redefine-sym _binary_data_$(subst -,_,$*)_smp_end=_SamplesEnd \ + --redefine-sym _binary_data_$(subst -,_,$*)_smp_size=_SamplesSize \ + $^ $@ + +data/%.o: data/%.txt + @echo "[OBJCOPY] $(DIR)$< -> $(DIR)$@" + $(OBJCOPY) -I binary -O amiga \ + --redefine-sym _binary_data_$(subst -,_,$*)_txt_start=_Text \ + --redefine-sym _binary_data_$(subst -,_,$*)_txt_end=_TextEnd \ + --redefine-sym _binary_data_$(subst -,_,$*)_txt_size=_TextSize \ + $^ $@ + +REPOHEAD := $(shell git rev-parse --short HEAD) + +%.exe.packed: %.exe + @echo "[PACK] $(DIR)$< -> $(DIR)$@" + Shrinkler -o -f 0xdff180 \ + -t "$(DEMONAME) by Ghostown (build: $(REPOHEAD))" $< $@ diff --git a/demo/data/.gitignore b/demo/data/.gitignore new file mode 100644 index 00000000..5872629b --- /dev/null +++ b/demo/data/.gitignore @@ -0,0 +1,2 @@ +*.c +*.delta diff --git a/demo/data/JazzCat-ElectricLifeforms.mod b/demo/data/JazzCat-ElectricLifeforms.mod new file mode 100644 index 00000000..9fb9579f --- /dev/null +++ b/demo/data/JazzCat-ElectricLifeforms.mod @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6ba6ffb4b0c688a64f2058ae92c0261cc82aff4fdd41e3bb8d9059d256a47ab9 +size 193958 diff --git a/demo/data/demo.sync b/demo/data/demo.sync new file mode 100644 index 00000000..37c1ed90 --- /dev/null +++ b/demo/data/demo.sync @@ -0,0 +1,4 @@ +@track EffectNumber +$0000 0 !step +$1000 -1 +@end diff --git a/demo/data/loader.png b/demo/data/loader.png new file mode 100644 index 00000000..d0bd65f0 --- /dev/null +++ b/demo/data/loader.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a212e37e5712290c908d9faf506d7e34cab3995da1691addccc74d43202086de +size 458 diff --git a/demo/delta.py b/demo/delta.py new file mode 100755 index 00000000..0b173aea --- /dev/null +++ b/demo/delta.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +import argparse +from array import array + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Encode 8-bit samples with delta encoding.') + parser.add_argument( + 'decoded', metavar='DECODED', type=str, help='Decoded samples file.') + parser.add_argument( + 'encoded', metavar='ENCODED', type=str, help='Encoded samples file.') + args = parser.parse_args() + + data = array('B') + + with open(args.decoded, 'rb') as f: + data.frombytes(f.read()) + data.append(0) # sentinel: for i == 0, data[i - 1] is this 0, so out[0] is the first sample unchanged + + out = array('B') + for i in range(len(data) - 1): # every original byte; the sentinel itself is not encoded + out.append((data[i] - data[i - 1]) & 255) # difference to the previous byte, modulo 256 + + with open(args.encoded, 'wb') as f: + f.write(out) diff --git a/demo/demo.h b/demo/demo.h new file mode 100644 index 00000000..ba4682ef --- /dev/null +++ b/demo/demo.h @@ -0,0 +1,21 @@ +#ifndef __DEMO_H__ +#define __DEMO_H__ + +#include +#include +#include +#include + +short UpdateFrameCount(void); + +static inline short FromCurrKeyFrame(TrackT *track) { + return frameCount - CurrKeyFrame(track); +} + +static inline short TillNextKeyFrame(TrackT *track) { + return NextKeyFrame(track) - frameCount; +} + +void FadeBlack(const u_short *colors, short count, u_int start, short step); + +#endif /* !__DEMO_H__ */ 
diff --git a/demo/empty.c b/demo/empty.c new file mode 100644 index 00000000..417090de --- /dev/null +++ b/demo/empty.c @@ -0,0 +1,12 @@ +#include + +static void Init(void) { +} + +static void Kill(void) { +} + +static void Render(void) { +} + +EFFECT(Empty, NULL, NULL, Init, Kill, Render, NULL); diff --git a/demo/load.c b/demo/load.c new file mode 100644 index 00000000..01c722b1 --- /dev/null +++ b/demo/load.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include + +#define _SYSTEM +#include +#include + +#include "data/loader.c" + +#define X1 96 +#define Y1 (120 + 32) +#define X2 224 +#define Y2 (136 + 24) + +/* from 0 to 256 */ +static volatile short LoadProgress = 0; + +static int ProgressBarUpdate(void) { + static __code short x = 0; + short newX = LoadProgress >> 1; /* 0..256 progress -> 0..128 columns, i.e. X2 - X1 */ + for (; x < newX; x++) { /* x is static: only columns added since the last call are drawn */ + CpuLine(X1 + x, Y1, X1 + x, Y2); + } + return 0; +} + +INTSERVER(ProgressBarInterrupt, 0, (IntFuncT)ProgressBarUpdate, NULL); + +static void LoadData(void) { + while (LoadProgress < 256) { + LoadProgress++; /* no data is read here: progress simply advances one step per WaitVBlank() */ + WaitVBlank(); + } +} + +#define WIDTH 320 +#define HEIGHT 256 +#define DEPTH 1 + +void LoadDemo(void) { + BitmapT *screen = NewBitmap(WIDTH, HEIGHT, DEPTH, BM_CLEAR); + CopListT *cp = NewCopList(40); + + CpuLineSetup(screen, 0); + CpuLine(X1 - 1, Y1 - 2, X2 + 1, Y1 - 2); + CpuLine(X1 - 1, Y2 + 1, X2 + 1, Y2 + 1); + CpuLine(X1 - 2, Y1 - 1, X1 - 2, Y2 + 1); + CpuLine(X2 + 2, Y1 - 1, X2 + 2, Y2 + 1); + + SetupPlayfield(MODE_LORES, DEPTH, X(0), Y(0), WIDTH, HEIGHT); + LoadColors(loader_colors, 0); + + EnableDMA(DMAF_BLITTER); + BitmapCopy(screen, (WIDTH - loader_width) / 2, Y1 - loader_height - 16, + &loader); + WaitBlitter(); + DisableDMA(DMAF_BLITTER); + + CopSetupBitplanes(cp, screen, DEPTH); + CopListFinish(cp); + CopListActivate(cp); + + EnableDMA(DMAF_RASTER); + + AddIntServer(INTB_VERTB, ProgressBarInterrupt); + LoadData(); + RemIntServer(INTB_VERTB, ProgressBarInterrupt); + + DisableDMA(DMAF_RASTER); + DeleteCopList(cp); + + 
DeleteBitmap(screen); +} diff --git a/demo/main.c b/demo/main.c new file mode 100644 index 00000000..26f94185 --- /dev/null +++ b/demo/main.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "demo.h" + +#define _SYSTEM +#include +#include + +extern u_char Module[]; +extern u_char Samples[]; +#if DELTA == 1 +extern u_char SamplesSize[]; +#endif + +extern EffectT EmptyEffect; + +short frameFromStart; +short frameTillEnd; + +#include "data/demo.c" + +static EffectT *AllEffects[] = { + &EmptyEffect, + NULL, +}; + +static void ShowMemStats(void) { + Log("[Memory] CHIP: %d FAST: %d\n", MemAvail(MEMF_CHIP), MemAvail(MEMF_FAST)); +} + +static void LoadEffects(EffectT **effects) { /* `effects` is a NULL-terminated array */ + EffectT *effect; + for (effect = *effects; effect; effect = *++effects) { /* advance, then read: each entry is visited exactly once */ + EffectLoad(effect); + if (effect->Load) + ShowMemStats(); + } +} + +static void UnLoadEffects(EffectT **effects) { /* `effects` is a NULL-terminated array */ + EffectT *effect; + for (effect = *effects; effect; effect = *++effects) { /* advance, then read: each entry is visited exactly once */ + EffectUnLoad(effect); + } +} + +void FadeBlack(const u_short *colors, short count, u_int start, short step) { + volatile short *reg = &custom->color[start]; + + if (step < 0) + step = 0; + if (step > 15) + step = 15; + + while (--count >= 0) { + short to = *colors++; + + short r = ((to >> 4) & 0xf0) | step; + short g = (to & 0xf0) | step; + short b = ((to << 4) & 0xf0) | step; + + r = colortab[r]; + g = colortab[g]; + b = colortab[b]; + + *reg++ = (r << 4) | g | (b >> 4); + } +} + +short UpdateFrameCount(void) { + short t = ReadFrameCounter(); + frameCount = t; + frameFromStart = t - CurrKeyFrame(&EffectNumber); + frameTillEnd = NextKeyFrame(&EffectNumber) - t; + return t; +} + +static volatile EffectFuncT VBlankHandler = NULL; + +static int VBlankISR(void) { + if (VBlankHandler) + VBlankHandler(); + return 0; +} + +INTSERVER(VBlankInterrupt, 0, (IntFuncT)VBlankISR, NULL); + +#define SYNCPOS(pos) (((((pos) & 0xff00) >> 2) | ((pos) & 0x3f)) * 3) + 
+static void RunEffects(void) { + /* Set the beginning of intro. Useful for effect synchronization! */ + short pos = 0; + + frameCount = SYNCPOS(pos); + SetFrameCounter(frameCount); + PtData.mt_SongPos = pos >> 8; + PtData.mt_PatternPos = (pos & 0x3f) << 4; + PtEnable = -1; + + AddIntServer(INTB_VERTB, VBlankInterrupt); + + for (;;) { + static short prev = -1; + short curr = TrackValueGet(&EffectNumber, frameCount); + + // Log("prev: %d, curr: %d, frameCount: %d\n", prev, curr, frameCount); + + if (prev != curr) { + if (prev >= 0) { + VBlankHandler = NULL; + EffectKill(AllEffects[prev]); + ShowMemStats(); + } + if (curr == -1) + break; + EffectInit(AllEffects[curr]); + VBlankHandler = AllEffects[curr]->VBlank; + ShowMemStats(); + Log("[Effect] Transition to %s took %d frames!\n", + AllEffects[curr]->name, ReadFrameCounter() - lastFrameCount); + lastFrameCount = ReadFrameCounter() - 1; + } + + { + EffectT *effect = AllEffects[curr]; + short t = UpdateFrameCount(); + if ((lastFrameCount != frameCount) && effect->Render) + effect->Render(); + lastFrameCount = t; + } + + prev = curr; + } + + RemIntServer(INTB_VERTB, VBlankInterrupt); +} + +#if DELTA == 1 +static void DecodeSamples(u_char *smp, int size) { /* in-place delta decode of `size` bytes: smp[i] += smp[i - 1] */ + u_char data; + short n = (size + 6) / 8 - 1; /* smp[0] is stored verbatim, so only size - 1 deltas follow: n = ceil((size - 1) / 8) - 1 */ + short k = (size - 1) & 7; /* remainder of those size - 1 steps, handled by the first partial pass */ + Log("[Init] Decoding delta samples (%d bytes)\n", size); + if (size < 2) return; /* nothing to decode; n would start at -1 and the do-while below would wrap */ + data = *smp++; + switch (k) { /* Duff's device: jump into the unrolled loop to consume the remainder first */ + case 0: do { data += *smp; *smp++ = data; + case 7: data += *smp; *smp++ = data; + case 6: data += *smp; *smp++ = data; + case 5: data += *smp; *smp++ = data; + case 4: data += *smp; *smp++ = data; + case 3: data += *smp; *smp++ = data; + case 2: data += *smp; *smp++ = data; + case 1: data += *smp; *smp++ = data; + } while (--n != -1); + } +} +#endif + +extern void LoadDemo(void); + +int main(void) { + /* NOP that triggers fs-uae debugger to stop and inform GDB that it should + * fetch segments locations to relocate symbol information read from file. 
*/ + asm volatile("exg %d7,%d7"); + + ResetSprites(); + LoadDemo(); +#if DELTA == 1 + Log("[Init] Decoding samples\n"); + DecodeSamples(Samples, (int)SamplesSize); +#endif + PtInstallCIA(); + PtInit(Module, Samples, 0); + + { + TrackT **trkp = AllTracks; + while (*trkp) + TrackInit(*trkp++); + } + + LoadEffects(AllEffects); + + RunEffects(); + + PtEnd(); + PtRemoveCIA(); + + UnLoadEffects(AllEffects); + + return 0; +} diff --git a/effects/optimization.md b/effects/optimization.md new file mode 100644 index 00000000..f85523fb --- /dev/null +++ b/effects/optimization.md @@ -0,0 +1,84 @@ +# Hand-optimizing C code by looking at the generated assembly + +The purpose of this document is to present several optimization techniques which can be employed to speed up code generated by gcc for the m68k. + +Our tool of choice is the `tools/disass.py` script, which produces disassembly output along with a cycle count for each instruction and for each function. + +## General rules of thumb: + +1. Less code = fewer cycles. This may sound trivial, but will be our guiding principle. +2. Shorter instructions are faster (and consume fewer bus cycles) because the CPU doesn't have to fetch as many bytes. + +## Put constants used in loops in registers + +We want tight loops to have as few instructions as possible, preferably short ones. If you notice long instructions in a loop that are 3-4 bytes long and contain a constant, you should place that constant in a variable before the loop (that should effectively place it in a register) and use that variable inside the loop instead. + +## Use shorts instead of u_shorts + +The compiler likes `short`s. Try to avoid using `u_short`s when you don't absolutely have to - `short`s tend to generate fewer instructions because the compiler doesn't have to work around the sign bit in some cases. Always compare the generated assembly before and after the change though - in rare cases the generated assembly will be faster with `u_short`s. 
+ +## Optimizing loops - force use of dedicated looping instruction + +Loops in the form of `for (i = n - 1; i >= 0; i--)` (or any other that can be rewritten into this form) can take advantage of the dedicated `dbf` (decrement and branch) instruction. The general form is: + +```C +short n = upper_limit - 1; +do { + // ... +} while (--n != -1); +``` + +## Avoid using globals in loops + +Accessing memory with global variables makes the instruction longer and therefore slower. If you need to use a global symbol in your function, pass it as a parameter - you can pass up to 4 arguments in registers before the compiler starts putting them on the stack. They end up in registers d0, d1, a0, a1. If there are more than 2 data arguments, the compiler will put them in the address registers and vice versa. +Alternatively you can force putting a specific variable in a specific register, e.g.: +`register volatile void *ptr asm("a5") = &some_global;` + +# Data structure design + +## Place data in memory in the order of access + +By accessing consecutive elements in memory you can take advantage of the m68k post-increment addressing mode, which performs both the memory access and the pointer increment in a single instruction. +The following example demonstrates how to make the compiler generate it, used with the previous technique for optimizing loops: + +``` +short *pts = (short *)some_array; +short n = some_array_length - 1; +do { + short x = (*pts++); + short y = (*pts++); + // ... +} while (--n != -1); +``` + +This also works with structs and arrays of structs. + +Here's a more complicated example for fast blitter configuration. In this case we take advantage of the fact that the blitter's registers are laid out consecutively in memory. Starting at `bltcon1`'s address we fill consecutive memory addresses which correspond to some blitter registers. The order of all blitter registers (and more) can be checked in `include/custom_regdef.h`. 
+ +``` +register volatile void *ptr asm("a5") = &custom_->bltcon1; + +/* Comment it out if you're feeling lucky! */ +_WaitBlitter(custom_); + +*((short *)ptr)++ = shift; // bltcon1 +*((int *)ptr)++ = mask; // bltafwm & bltalwm +*((int *)ptr)++ = (int)dstpt; // bltcpt +*((int *)ptr)++ = (int)srcpt; // bltbpt +ptr += 4; // bltapt +*((int *)ptr)++ = (int)dstpt; // bltdpt +*((short *)ptr)++ = BLTSIZE; // bltsize +``` + +## Move as many calculations to the data as you (reasonably) can + +Consider a simple example, where some calculation always takes place in a loop: + +``` +for (i = 0; i < n; i++) { + short x = t[i] + SOME_CONST; + // ... +} +``` + +If `SOME_CONST` is always the same, values in `t` can be modified to already have `SOME_CONST` added from the beginning, so that it doesn't have to be added in every loop iteration. \ No newline at end of file