Skip to content

Commit 580e786

Browse files
committed
change math.crc32 to the same algorithm as pkzip/zlib uses (ISO-HDLC). Add math.crc32_end_result(). Fix a parse error in profiler.py script.
1 parent c0ae35b commit 580e786

File tree

7 files changed

+111
-66
lines changed

7 files changed

+111
-66
lines changed

compiler/res/prog8lib/math.p8

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -536,11 +536,9 @@ log2_tab
536536
eor cx16.r15H
537537
sta cx16.r15H
538538
ldy #8
539-
- lda cx16.r15H
540-
asl cx16.r15L
539+
- asl cx16.r15L
541540
rol cx16.r15H
542-
and #$80
543-
beq +
541+
bcc +
544542
lda cx16.r15H
545543
eor #$10
546544
sta cx16.r15H
@@ -554,11 +552,9 @@ log2_tab
554552
; original prog8 code was:
555553
; cx16.r15H ^= value
556554
; repeat 8 {
557-
; if cx16.r15H & $80 !=0 {
558-
; cx16.r15 <<=1
555+
; cx16.r15<<=1
556+
; if_cs
559557
; cx16.r15 ^= $1021
560-
; } else
561-
; cx16.r15<<=1
562558
; }
563559
}
564560

@@ -568,7 +564,7 @@ log2_tab
568564
}
569565

570566
sub crc32(uword data, uword length) {
571-
; Calculates the CRC-32 (POSIX) checksum of the buffer.
567+
; Calculates the CRC-32 (ISO-HDLC/PKZIP) checksum of the buffer.
572568
; because prog8 doesn't have 32 bits integers, we have to split up the calculation over 2 words.
573569
; result stored in cx16.r14 (low word) and cx16.r15 (high word)
574570
; There are also "streaming" crc32_start/update/end routines below, that allow you to calculate crc32 for data that doesn't fit in a single memory block.
@@ -586,62 +582,75 @@ log2_tab
586582
; start the "streaming" crc32
587583
; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
588584
; if your code uses these, it must save/restore them before calling this routine
589-
cx16.r14 = cx16.r15 = 0
585+
cx16.r14 = cx16.r15 = $ffff
590586
}
591587

592-
asmsub crc32_update(ubyte value @A) {
588+
sub crc32_update(ubyte value) {
593589
; update the "streaming" crc32 with next byte value
594590
; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
595591
; if your code uses these, it must save/restore them before calling this routine
592+
; implementation detail: see https://stackoverflow.com/a/75951866 , the zlib crc32 is the "reflected" variant
596593
%asm {{
597-
eor cx16.r15H
598-
sta cx16.r15H
594+
eor cx16.r14L
595+
sta cx16.r14L
599596
ldy #8
600-
- lda cx16.r15H
601-
asl cx16.r14L
602-
rol cx16.r14H
603-
rol cx16.r15L
604-
rol cx16.r15H
605-
and #$80
606-
beq +
597+
- lsr cx16.r15H
598+
ror cx16.r15L
599+
ror cx16.r14H
600+
ror cx16.r14L
601+
bcc +
607602
lda cx16.r15H
608-
eor #$04
603+
eor #$ed
609604
sta cx16.r15H
610605
lda cx16.r15L
611-
eor #$c1
606+
eor #$b8
612607
sta cx16.r15L
613608
lda cx16.r14H
614-
eor #$1d
609+
eor #$83
615610
sta cx16.r14H
616611
lda cx16.r14L
617-
eor #$b7
612+
eor #$20
618613
sta cx16.r14L
619614
+ dey
620615
bne -
621616
rts
622617
}}
623618
; original prog8 code:
624-
; cx16.r15H ^= value
619+
; cx16.r14L ^= value
625620
; repeat 8 {
626-
; if cx16.r15H & $80 !=0 {
627-
; cx16.r14 <<= 1
628-
; rol(cx16.r15)
629-
; cx16.r15 ^= $04c1
630-
; cx16.r14 ^= $1db7
631-
; }
632-
; else {
633-
; cx16.r14 <<= 1
634-
; rol(cx16.r15)
621+
; cx16.r15 >>= 1
622+
; ror(cx16.r14)
623+
; if_cs {
624+
; cx16.r15 ^= $edb8
625+
; cx16.r14 ^= $8320
635626
; }
636627
; }
637-
638628
}
639629

640630
sub crc32_end() {
641631
; finalize the "streaming" crc32
642632
; result stored in cx16.r14 (low word) and cx16.r15 (high word)
643-
cx16.r15 ^= $ffff
644-
cx16.r14 ^= $ffff
633+
void crc32_end_result()
634+
}
635+
636+
asmsub crc32_end_result() -> uword @R15, uword @R14 {
637+
; finalize the "streaming" crc32
638+
; returns the result value in cx16.r15 (high word) and r14 (low word)
639+
%asm {{
640+
lda cx16.r15H
641+
eor #255
642+
sta cx16.r15H
643+
lda cx16.r15L
644+
eor #255
645+
sta cx16.r15L
646+
lda cx16.r14H
647+
eor #255
648+
sta cx16.r14H
649+
lda cx16.r14L
650+
eor #255
651+
sta cx16.r14L
652+
rts
653+
}}
645654
}
646655

647656

compiler/res/prog8lib/virtual/math.p8

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -339,10 +339,9 @@ math {
339339
; if your code uses that, it must save/restore it before calling this routine
340340
cx16.r15H ^= value
341341
repeat 8 {
342-
if cx16.r15H & $80 !=0
343-
cx16.r15 = (cx16.r15<<1)^$1021
344-
else
345-
cx16.r15<<=1
342+
cx16.r15<<=1
343+
if_cs
344+
cx16.r15 ^= $1021
346345
}
347346
}
348347

@@ -352,7 +351,7 @@ math {
352351
}
353352

354353
sub crc32(uword data, uword length) {
355-
; Calculates the CRC-32 (POSIX) checksum of the buffer.
354+
; Calculates the CRC-32 (ISO-HDLC/PKZIP) checksum of the buffer.
356355
; because prog8 doesn't have 32 bits integers, we have to split up the calculation over 2 words.
357356
; result stored in cx16.r14 (low word) and cx16.r15 (high word)
358357
; There are also "streaming" crc32_start/update/end routines below, that allow you to calculate crc32 for data that doesn't fit in a single memory block.
@@ -370,24 +369,21 @@ math {
370369
; start the "streaming" crc32
371370
; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
372371
; if your code uses these, it must save/restore them before calling this routine
373-
cx16.r14 = cx16.r15 = 0
372+
cx16.r14 = cx16.r15 = $ffff
374373
}
375374

376375
sub crc32_update(ubyte value) {
377376
; update the "streaming" crc32 with next byte value
378377
; note: tracks the crc32 checksum in cx16.r14 and cx16.r15!
379378
; if your code uses these, it must save/restore them before calling this routine
380-
cx16.r15H ^= value
379+
; implementation detail: see https://stackoverflow.com/a/75951866 , the zlib crc32 is the "reflected" variant
380+
cx16.r14L ^= value
381381
repeat 8 {
382-
if cx16.r15H & $80 !=0 {
383-
cx16.r14 <<= 1
384-
rol(cx16.r15)
385-
cx16.r15 ^= $04c1
386-
cx16.r14 ^= $1db7
387-
}
388-
else {
389-
cx16.r14 <<= 1
390-
rol(cx16.r15)
382+
cx16.r15 >>= 1
383+
ror(cx16.r14)
384+
if_cs {
385+
cx16.r15 ^= $edb8
386+
cx16.r14 ^= $8320
391387
}
392388
}
393389
}
@@ -399,6 +395,7 @@ math {
399395
cx16.r14 ^= $ffff
400396
}
401397

398+
; there's no crc32_end_result() here because IR cannot return multiple values yet
402399

403400
sub lerp(ubyte v0, ubyte v1, ubyte t) -> ubyte {
404401
; Linear interpolation (LERP)

docs/source/libraries.rst

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -725,16 +725,20 @@ but perhaps the provided ones can be of service too.
725725
"streaming" crc16 calculation routines, when the data doesn't fit in a single buffer.
726726
Tracks the crc16 checksum in cx16.r15! If your code uses that, it must save/restore it before calling this routine!
727727
Call the start() routine first, feed it bytes with the update() routine, and finalize by calling the end() routine, which returns the crc16 value.
728+
Note: after calling the crc16_end() routine you must start over.
728729

729730
``crc32 (uword data, uword length)``
730-
Calculates a CRC-32 (POSIX) checksum over the given data buffer.
731+
Calculates a CRC-32 (ISO-HDLC/PKZIP) checksum over the given data buffer.
731732
The 32 bits result is stored in cx16.r14 (low word) and cx16.r15 (high word).
732733

733-
``crc32_start() / crc32_update(ubyte value) / crc32_end()``
734+
``crc32_start() / crc32_update(ubyte value) / crc32_end() / crc32_end_result()``
734735
"streaming" crc32 calculation routines, when the data doesn't fit in a single buffer.
735736
Tracks the crc32 checksum in cx16.r14 and cx16.r15! If your code uses these, it must save/restore them before calling this routine!
736737
Call the start() routine first, feed it bytes with the update() routine, finalize with calling the end() routine.
737738
The 32 bits result is stored in cx16.r14 (low word) and cx16.r15 (high word).
739+
Instead of the normal end() routine you can also call crc32_end_result() which finalizes the calculation,
740+
and actually returns the high and low words of the 32 bits result value as two return word values.
741+
Note: after calling the crc32_end() or crc32_end_result() routine you must start over.
738742

739743
``lerp(v0, v1, t)``
740744
Linear interpolation routine for unsigned byte values.

docs/source/technical.rst

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -232,14 +232,23 @@ Some notes and references into the compiler's source code modules:
232232

233233
Run-time memory profiling with the X16 emulator
234234
-----------------------------------------------
235+
236+
The compiler has the ``-dumpvars`` switch that will print a list of all variables and where they are placed into memory.
237+
This can be useful to track which variables end up in zeropage for instance. But it doesn't really show if the choices
238+
made are good, i.e. whether the variables that are actually used the most in your program are placed in zeropage.
239+
240+
But there is a way to actually *measure* the behavior of your program as it runs on the X16.
241+
See it as a simple way of *profiling* your program to find the hotspots that maybe need optimizing:
242+
235243
The X16 emulator has a ``-memorystats`` option that enables it to keep track of memory access count statistics,
236244
and write the accumulated counts to a file on exit.
237-
Prog8 includes a Python script ``profiler.py`` (find it in the "scripts" subdirectory of the source code distribution)
238-
that can cross-reference that file with an assembly listing produced by the compiler with the ``-asmlist`` option.
245+
Prog8 then provides a Python script ``profiler.py`` (find it in the "scripts" subdirectory of the source code distribution,
246+
or `online here <https://github.com/irmen/prog8/blob/master/scripts/profiler.py>`_).
247+
This script cross-references the memory stats file with an assembly listing of the program, produced by the Prog8 compiler with the ``-asmlist`` option.
239248
It then prints the top N lines in your (assembly) program source that perform the most reads and writes,
240249
which you can use to identify possible hot spots/bottlenecks/variables that should be better placed in zeropage etc.
241-
Note that the profiler just works with the number of accesses to memory locations, this is *not* the same
242-
as the most run-time (cpu instructions cycle times aren't taken into account at all).
250+
Note that the profiler simply works with the total number of accesses to memory locations.
251+
This is *not* the same as where the most run-time is spent (CPU instruction cycle times aren't taken into account at all)!
243252
Here is an example of the output it generates::
244253

245254
$ scripts/profiler.py -n 10 cobramk3-gfx.list memstats.txt  ✔
@@ -274,4 +283,10 @@ Here is an example of the output it generates::
274283
$01e7 (1280140) : cpu stack
275284
$0264 (1258159) : unknown
276285

277-
Apparently the most cpu activity while running this program is spent in a division routine.
286+
Apparently the most cpu activity while running this program is spent in a division routine which uses the 'remainder' and 'dividend' variables.
287+
As you can see, sometimes even actual assembly instructions end up in the tables above if they are in a routine that is executed very often (the 'stz' instructions in this example).
288+
The tool isn't powerful enough to see what routine the variables or instructions are part of, but it prints the line number in the assembly listing file so you can investigate that manually.
289+
290+
You can see in the example above that the variables that are among the most used are neatly placed in zeropage already.
291+
If you see for instance a variable that is heavily used and that is *not* in zeropage, you
292+
could consider adding ``@zp`` to that variable's declaration to prioritize it to be put into zeropage.

docs/source/todo.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
TODO
22
====
33

4-
- write something in the docs about how to optimize your program on the x16 using the -dumpvars option,
5-
the emulator's memory profiler + the profiler.py script to find hotspots for routines and variables that could be placed into zeropage
64
- announce prog8 on the 6502.org site?
75

86
...
@@ -24,6 +22,7 @@ Future Things and Ideas
2422
But all library code written in asm uses .proc already..... (textual search/replace when writing the actual asm?)
2523
Once new codegen is written that is based on the IR, this point is mostly moot anyway as that will have its own dead code removal on the IR level.
2624

25+
- Allow normal subroutines to return multiple values as well (just as asmsubs already can)
2726
- Change scoping rules for qualified symbols so that they don't always start from the root but behave like other programming languages (look in local scope first)
2827
- Fix missing cases where regular & has to return the start of the split array in memory whatever byte comes first. Search TODO("address of split word array")
2928
- something to reduce the need to use fully qualified names all the time. 'with' ? Or 'using <prefix>'?

examples/test.p8

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,31 @@
11
%import textio
2-
%import diskio
2+
%import math
33

44
%zeropage basicsafe
55
%option no_sysinit
66

77

88
main {
99
sub start() {
10-
cx16.r2 = diskio.get_loadaddress("test.prg")
11-
txt.print_uwhex(cx16.r2, true)
10+
str input = iso:"the quick brown fox jumps over the lazy dog"
11+
12+
13+
txt.print_uwhex(math.crc16(input, len(input)), true)
14+
txt.nl()
15+
16+
math.crc32(input, len(input))
17+
18+
txt.print_uwhex(cx16.r15, true)
19+
txt.print_uwhex(cx16.r14, false)
20+
txt.nl()
21+
22+
math.crc32_start()
23+
for cx16.r0L in input
24+
math.crc32_update(cx16.r0L)
25+
uword hiw,low
26+
hiw,low = math.crc32_end_result()
27+
txt.print_uwhex(hiw, true)
28+
txt.print_uwhex(low, false)
29+
txt.nl()
1230
}
1331
}

scripts/profiler.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
and prints out what assembly lines and variables were read from and written to the most.
88
These may indicate hot paths or even bottlenecks in your program,
99
and what variables in system ram might be better placed in Zeropage.
10+
11+
Also see https://prog8.readthedocs.io/en/latest/technical.html#run-time-memory-profiling-with-the-x16-emulator
12+
for an example of how to use this tool together with the X16 emulator.
1013
"""
1114

1215

@@ -26,7 +29,7 @@ def __init__(self, filename: str) -> None:
2629
if not line or line == '\n' or line[0] == ';':
2730
continue
2831
if line[0] == '=':
29-
value, symbol, _ = line.split(maxsplit=2)
32+
value, symbol = line.split(maxsplit=2)[:2]
3033
value = value[1:]
3134
if value:
3235
try:

0 commit comments

Comments
 (0)