diff options
Diffstat (limited to 'src/engine/btrace.c')
-rw-r--r-- | src/engine/btrace.c | 596 |
1 files changed, 596 insertions, 0 deletions
diff --git a/src/engine/btrace.c b/src/engine/btrace.c new file mode 100644 index 0000000..3c02570 --- /dev/null +++ b/src/engine/btrace.c @@ -0,0 +1,596 @@ +#include <config.h> +#ifndef _plan9_ +#ifndef NO_MALLOC_H +#include <malloc.h> +#endif +#include <math.h> +#include <stdio.h> +#include <string.h> +#include <limits.h> +#else +#include <u.h> +#include <libc.h> +#include <stdio.h> +#endif +#define SLARGEITER +#include <filter.h> +#include <fractal.h> +#include <xthread.h> +/* + * This is an implementation of famous boundary trace algorithm. + * See fractint documentation if you don't know what this means :) + * + * Here is two different implentation of this algorithm - one is faster + * and second is threadable. + * + * An faster one is quite usual implementation - get first uncalculated pixel, + * trace algorithm using labirinth "right hand rule" way (well, I currently + * use left hand rule) then trace same boundary again and fill. It is + * implemented in tracerectangle/tracecolor + * + * An threaded one follows description I sent to sci.fractals: + Hi + few weeks ago I asked for multithreaded algorithm for boundary trace + I received following reply by Paul Derbyshire + > One method is this. One b-trace algorithm pushes pixels onto a stack. + > Initially the screen border pixels are all pushed. Then a loop starts. A + > pixel is popped and calculated or recalled, with 4 orthogonal neighbors + > calculated or recalled if already calculated; if the central pixel is at a + > color boundary the neighbors are pushed. The image is done when the stack + > hits empty, then empty areas of image are floodfilled by their boundary + > color. (Fractint does it differently, not sure how). By this method, the + > stack will usually have at least 4 pixels, and so four substacks can be + > assigned to each of 4 processors, and each processor has a "processor to + > its left" designated as if they were in a "logical" ring. Each processor + > pushes new pixels on the processor to its left's substack, and pops from + > its own. This way, busy parts of the image wind up spread among all + > processors. By adding substacks, this can be expanded to accomodate more + > processors. Some amount is optimal, after which a point of diminishing + > returns is reached when most processors only do a few pixels and spend + > most of their time waiting for new stuff from the processor to its right. + > You'll need to figure out this optimum somehow; I can't guess what it + > would be. Probably around 64 processors. (More than that, you would do + > well just to assign separate processors small rectangular subsets of the + > image anyways.) Also, the end is only reached when NO processors have + > anything in their stacks. + This method looks very interesting but has few serious problems. + Most significant probably is that it always caluclates pixels up + to distance 3 from boundary. Simple modification should lower it + to distance 2. But "right hand rule" based algorithm should actualy + calculate points just to distance 1. So I want to implement such alrogithm, + since number of caluclated points is still significant. + + So I think I have to extend stack for two informations: + 1) direction + 2) color of boundary I am tracking(not color I am at) + and main algorithm should look like: + 1) detect color of current point + 2) look right. Is there same color? + yes:add point at the right to stack and exit + is there boundary color? + no:we are meeting boundary with different color - so we need + to track this boundary too. add point at right to stack with oposite + direction and boundary color set to current color + 3) look forward: similiar scheme as to look right + 4) look left: again similiar + 5) exit + + This hould trace boundaries to distance 1 (I hope) and do not miss anything. + Problem is that this algorithm never ends, since boundaries will be rescaned + again and again. So I need to add an caluclated array wich should looks like: + for every point has mask for all directions that were scaned+calculated mask + (set to 1 if pixel is already calculated)+inprocess mask(set to 1 if some + other processor is currently calculating it) + + Scan masks should be set in thime when pixel is added to stack and pixel + should not be added in case that mask is already set. I don't this that locks + to this array is required since time spent by setting masks should be quite + small and possible race conditions should not be drastical(except recalculating + some point) + + I was also thinking about perCPU queues. I think that one queue is OK, + it is simplier and should not take much more time(except it will be locked + more often but time spend in locked queue in comparsion to time spent + in rest should be so small so this should not be drastical for less than 10 + procesors) + + At the other hand - perCPU queues should have one advantage - each + cpu should own part of image and points from its part will be added to + this cpu. This should avoid procesor cache conflict and speed up process. + At the other hand, when cpu's queue is empty, cpu will have to browse + others queues too and steal some points from other CPU, wich should introduce + some slowdown and I am not sure if this way will bring any improvement. + + Other think that should vote for perCPU queue is fact, that in one CPU + queue should be more often situations when queue is empty since one procesor + is caluclating points and other has wait, since it had tendency to trace + just one boundary at the time. At the other hand, most of boundaries should + cross broders, so they should be added to queue at the initialization, so + this should be OK. + + I am beginer to threds, SMP etc. So I looking for ideas, and suggestions + to improve this alg. (or design different one). + Also someone with SMP box, who should test me code is welcomed. + BTW whats is the average/maximal number of CPU in todays SMP boxes? + + Please reply directly to my email:hubicka@paru.cas.cz + + Thanks + Honza + * This way is implemented in tracerectangle2/tracepoint. It is enabled + * just when threads are compiled in. Also when nthreads=1, old faster + * algorithm is used. + * + * Implementation notes: + * 1) I decided to use one queue instead of stack, since I expect, it will + * have tendency to trace all boundaries at the time, not just one. + * This will make queue bigger and reduce probability of situation, where + * queue is empty and other processors has to wait for one, that is + * calculating and should add something there (maybe :) + * 2) Stack (queue :) is used just when neccesary - in situations where queue + * is quite full (there is more items than 10) procesor just continues in + * tracing path it started. This reduces number of slow stack operations, + * locks/unlocks, cache conflicts and other bad thinks. + * 3) Just each fourth pixel should be added into queue + * 4) Foodfill algorithm should be avoided since colors at the boundaries + * are always correct, we should simply go trought each scanline and when + * pixel is uncalcualted, copy value from its left neighbor + * + * Current implementation has about 6% lower results than "fast" algorithm + * using one thread. When two threads enabled (at my one processor linux + * box) lock/unlock overhead eats next 8%, three threads eats next 1% :) + */ +#ifdef HAVE_ALLOCA_H +#include <alloca.h> +#endif +#include <archaccel.h> + +#include <filter.h> +#include <btrace.h> +#include <plane.h> +#include "calculate.h" + +#define UP 0 +#define RIGHT 1 +#define DOWN 2 +#define LEFT 3 +#define turnleft(d) (((d)+3)&3) +#define turnright(d) (((d)+1)&3) +#define turnoposite(d) (((d)+2)&3) +#define callwait() if(cfilter.wait_function!=NULL) cfilter.wait_function(&cfilter); + + +#ifndef nthreads +static int size; +static unsigned int inset; +static int nwaiting; +static int exitnow; +#define PAGESHIFT 14 +#define PAGESIZE (1<<PAGESHIFT) +#define MAXPAGES 200 /*Well limit is about 6MB of stack..Hope it will never owerflow */ +#define MAXSIZE (MAXPAGES*PAGESIZE-1) +struct stack { + int color; + short x, y; + char direction; +}; +static int npages[MAXTHREADS]; +static struct stack *pages[MAXPAGES]; +static struct stack **starts[MAXTHREADS]; +static int sizes[MAXTHREADS]; +static int maxsize, maxsize2; +static CONST char dirrections[][2] = { + {0, -1}, + {1, 0}, + {0, 1}, + {-1, 0}, +}; + +#define addstack(sx,sy,d,c,periodicity) { \ + int page; \ + int nstack=(((sy)-ystart)*nthreads)/(yend-ystart+1); \ + struct stack *ptr; \ + calculated[sx+sy*CALCWIDTH]|=1<<d; \ + xth_lock(0); \ + if(size<maxsize2) {\ + while(sizes[nstack]>=maxsize) if(nstack>=nthreads-1) nstack=0; else nstack++; \ + page=sizes[nstack]>>PAGESHIFT; \ + if(page==npages[nstack]) starts[nstack][npages[nstack]]=(struct stack *)malloc(sizeof(struct stack)*PAGESIZE),npages[nstack]++;\ + ptr=starts[nstack][page]+(sizes[nstack]&(PAGESIZE-1)); \ + ptr->x=sx; \ + ptr->y=sy; \ + if(periodicity) \ + ptr->direction=d|8; else \ + ptr->direction=d; \ + ptr->color=c; \ + size++; \ + sizes[nstack]++; \ + if(nwaiting) xth_wakefirst(0); \ + } \ + xth_unlock(0); \ +} +/*Non locking one used by init code */ +#define addstack1(sx,sy,d,c) { \ + int page; \ + struct stack *ptr; \ + int nstack=(((sy)-y1)*nthreads)/(y2-y1+1); \ + calculated[sx+sy*CALCWIDTH]|=1<<d; \ + if(size<maxsize2) {\ + while(sizes[nstack]>=maxsize) if(nstack==nthreads-1) nstack=0; else nstack++; \ + page=sizes[nstack]>>PAGESHIFT; \ + if(page==npages[nstack]) starts[nstack][npages[nstack]]=(struct stack *)malloc(sizeof(struct stack)*PAGESIZE),npages[nstack]++; \ + ptr=starts[nstack][page]+(sizes[nstack]&(PAGESIZE-1)); \ + ptr->x=sx; \ + ptr->y=sy; \ + ptr->direction=d|8; \ + ptr->color=c; \ + size++; \ + sizes[nstack]++; \ + } \ +} +static int xstart, ystart, xend, yend; +#endif + +static unsigned char *calculated; +#define CALCULATED 16 +#define CALCULATING 32 +#define CALCWIDTH cimage.width + +static number_t *xcoord, *ycoord; +#ifndef inline +REGISTERS(3) +CONSTF static pixel32_t calculatepixel(int x, int y, int peri) +{ + return (calculate(xcoord[x], ycoord[y], peri)); +} +#else +#define calculatepixel(x,y,peri) (calculate(xcoord[x],ycoord[y],peri)) +#endif +#define putpixel(x,y,c) p_setp((cpixel_t *)cimage.currlines[y],x,c) +#define getpixel(x,y) p_getp((cpixel_t *)cimage.currlines[y],x) +#include <c256.h> +#define tracecolor tracecolor8 +#define tracepoint tracepoint8 +#define dosymetries dosymetries8 +#define queue queue8 +#define bfill bfill8 +#include "btraced.c" +#include <hicolor.h> +#define tracecolor tracecolor16 +#define tracepoint tracepoint16 +#define dosymetries dosymetries16 +#define queue queue16 +#define bfill bfill16 +#include "btraced.c" +#include <true24.h> +#define tracecolor tracecolor24 +#define tracepoint tracepoint24 +#define dosymetries dosymetries24 +#define queue queue24 +#define bfill bfill24 +#include "btraced.c" +#include <truecolor.h> +#define tracecolor tracecolor32 +#define tracepoint tracepoint32 +#define dosymetries dosymetries32 +#define queue queue32 +#define bfill bfill32 +#include "btraced.c" + +#ifdef HAVE_GETTEXT +#include <libintl.h> +#else +#define gettext(STRING) STRING +#endif + +#ifndef SLOWCACHESYNC +#ifndef nthreads +static int tracerectangle2(int x1, int y1, int x2, int y2) +{ + int x, y; + cfilter.max = y2 - y1; + cfilter.pass = gettext("Boundary trace"); + cfilter.pos = 0; + maxsize = MAXPAGES / nthreads; + for (y = 0; y < nthreads; y++) { + npages[y] = 0; /*stack is empty */ + sizes[y] = 0; + starts[y] = pages + y * maxsize; + } + maxsize *= PAGESIZE; + maxsize2 = maxsize * nthreads; + size = 0; + nwaiting = 0; + exitnow = 0; + inset = cpalette.pixels[0]; + for (y = y1; y <= y2; y++) { + memset_long(calculated + x1 + y * CALCWIDTH, 0, x2 - x1 + 1); + } + for (x = x1; x <= x2; x += 4) { + addstack1(x, y1, LEFT, INT_MAX); + addstack1(x, y2, RIGHT, INT_MAX); + } + for (y = y1; y <= y2; y += 4) { + addstack1(x1, y, DOWN, INT_MAX); + addstack1(x2, y, UP, INT_MAX); + } + xstart = x1; + ystart = y1; + xend = x2; + yend = y2; + switch (cimage.bytesperpixel) { + case 1: + xth_function(queue8, NULL, 1); + xth_sync(); + xth_function(bfill8, NULL, yend - ystart - 1); + break; +#ifdef SUPPORT16 + case 2: + xth_function(queue16, NULL, 1); + xth_sync(); + xth_function(bfill16, NULL, yend - ystart - 1); + break; +#endif +#ifdef STRUECOLOR24 + case 3: + xth_function(queue24, NULL, 1); + xth_sync(); + xth_function(bfill24, NULL, yend - ystart - 1); + break; +#endif + case 4: + xth_function(queue32, NULL, 1); + xth_sync(); + xth_function(bfill32, NULL, yend - ystart - 1); + break; + } + xth_sync(); + for (y = 0; y < nthreads; y++) + for (x = 0; x < npages[y]; x++) + free(starts[y][x]); /*free memory allocated for stack */ + return 1; +} +#endif +#endif +static void skip(int x1, int y1, int x2, int y2) +{ + int src = y1; + int xstart = x1 * cimage.bytesperpixel; + int xsize = (x2 - x1 + 1) * cimage.bytesperpixel; + y1++; + for (; y1 <= y2; y1++) { + memcpy(cimage.currlines[y1] + xstart, + cimage.currlines[src] + xstart, xsize); + ycoord[y1] = ycoord[src]; + } +} + +static int tracerectangle(int x1, int y1, int x2, int y2) +{ + int x, y; + unsigned char *calc; + cfilter.max = y2 - y1; + cfilter.pass = gettext("Boundary trace"); + cfilter.pos = 0; + for (y = y1; y <= y2; y++) { + memset_long(calculated + x1 + y * CALCWIDTH, 0, + (size_t) (x2 - x1 + 1)); + } + switch (cimage.bytesperpixel) { + case 1: + for (y = y1; y <= y2; y++) { + calc = calculated + y * CALCWIDTH; + for (x = x1; x <= x2; x++) + if (!calc[x]) { + tracecolor8(x1, y1, x2, y2, x, y); + } + cfilter.pos = y - y1; + callwait(); + if (cfilter.interrupt) { + skip(x1, y, x2, y2); + return 0; + } + } + break; +#ifdef SUPPORT16 + case 2: + for (y = y1; y <= y2; y++) { + calc = calculated + y * CALCWIDTH; + for (x = x1; x <= x2; x++) + if (!calc[x]) { + tracecolor16(x1, y1, x2, y2, x, y); + } + cfilter.pos = y - y1; + callwait(); + if (cfilter.interrupt) { + skip(x1, y, x2, y2); + return 0; + } + } + break; +#endif +#ifdef STRUECOLOR24 + case 3: + for (y = y1; y <= y2; y++) { + calc = calculated + y * CALCWIDTH; + for (x = x1; x <= x2; x++) + if (!calc[x]) { + tracecolor24(x1, y1, x2, y2, x, y); + } + cfilter.pos = y - y1; + callwait(); + if (cfilter.interrupt) { + skip(x1, y, x2, y2); + return 0; + } + } +#endif + case 4: + for (y = y1; y <= y2; y++) { + calc = calculated + y * CALCWIDTH; + for (x = x1; x <= x2; x++) + if (!calc[x]) { + tracecolor32(x1, y1, x2, y2, x, y); + } + cfilter.pos = y - y1; + callwait(); + if (cfilter.interrupt) { + skip(x1, y, x2, y2); + return 0; + } + } + break; + } + return 1; +} + +int +boundarytrace(int x1, int y1, int x2, int y2, number_t * xpos, + number_t * ypos) +{ + int i; + int i1; + int xsym, ysym; + int cy1, cy2; + int cx1, cx2; + int ydiv; +#ifdef HAVE_ALLOCA + calculated = (unsigned char *) alloca(cimage.width * (y2 + 1)); +#else + calculated = (unsigned char *) malloc(cimage.width * (y2 + 1)); +#endif + if (calculated == NULL) { + return 0; + } + xcoord = xpos; + ycoord = ypos; + + + if (cursymetry.xsym < cfractalc.rs.nc + || cursymetry.xsym > cfractalc.rs.mc) + xsym = -10; + else + xsym = + (int) (0.5 + + ((cursymetry.xsym - + cfractalc.rs.nc) * cimage.width / (cfractalc.rs.mc - + cfractalc.rs.nc))); + if (cursymetry.ysym < cfractalc.rs.ni + || cursymetry.ysym > cfractalc.rs.mi) + ysym = -10; + else + ysym = + (int) (0.5 + + ((cursymetry.ysym - + cfractalc.rs.ni) * cimage.height / (cfractalc.rs.mi - + cfractalc.rs. + ni))); + ydiv = + (int) (0.5 + + ((-cfractalc.rs.ni) * cimage.height / + (cfractalc.rs.mi - cfractalc.rs.ni))); + if (xsym > x1 && xsym < x2) { + if (xsym - x1 > x2 - xsym) + cx1 = x1, cx2 = xsym; + else + /*xsym--, */ cx1 = xsym, cx2 = x2; + } else + xsym = -1, cx1 = x1, cx2 = x2; + if (ysym > y1 && ysym < y2) { + if (ysym - y1 > y2 - ysym) + cy1 = y1, cy2 = ysym; + else + cy1 = ysym, cy2 = y2; + } else + ysym = -1, cy1 = y1, cy2 = y2; + for (i = cx1; i <= cx2; i++) { + xcoord[i] = + cfractalc.rs.nc + i * (cfractalc.rs.mc - + cfractalc.rs.nc) / cimage.width; + } + for (i = cy1; i <= cy2; i++) { + ycoord[i] = + cfractalc.rs.ni + i * (cfractalc.rs.mi - + cfractalc.rs.ni) / cimage.height; + } + i = 1; +#ifndef SLOWCACHESYNC +#ifndef nthreads + if (nthreads != 1) { + if (ydiv > cy1 && ydiv < cy2) { + i |= tracerectangle2(cx1, cy1, cx2, ydiv), + i |= tracerectangle2(cx1, ydiv, cx2, cy2); + } else + i |= tracerectangle2(cx1, cy1, cx2, cy2); + } else +#endif +#endif + { + if (ydiv > cy1 && ydiv < cy2) { + i |= tracerectangle(cx1, cy1, cx2, ydiv), + i |= tracerectangle(cx1, ydiv, cx2, cy2); + } else + i |= tracerectangle(cx1, cy1, cx2, cy2); + } + if (!i) { +#ifndef HAVE_ALLOCA + free(calculated); +#endif + return 0; + } +#ifndef HAVE_ALLOCA + free(calculated); +#endif + drivercall(cimage, + dosymetries8(x1, x2, y1, y2, xsym, cx1, cx2), + dosymetries16(x1, x2, y1, y2, xsym, cx1, cx2), + dosymetries24(x1, x2, y1, y2, xsym, cx1, cx2), + dosymetries32(x1, x2, y1, y2, xsym, cx1, cx2)); + for (i = cx1; i <= cx2; i++) { + if (xsym != -1) { + i1 = 2 * xsym - i; + if (i1 >= x1 && i1 <= x2 && i != i1) + xcoord[i1] = 2 * cursymetry.xsym - xcoord[i]; + } + } + for (i = cy1; i <= cy2; i++) { + if (ysym != -1) { + i1 = 2 * ysym - i; + if (i1 >= y1 && i1 <= y2 && i != i1) + ycoord[i1] = 2 * cursymetry.ysym - ycoord[i]; + } + } + if (cy1 != y1) { + register int yy1, yy2; + int xstart = x1 * cimage.bytesperpixel; + int xsize = (x2 - x1 + 1) * cimage.bytesperpixel; + yy1 = y1; + yy2 = 2 * ysym - y1; + while (yy1 < yy2) { + memcpy(cimage.currlines[yy1] + xstart, + cimage.currlines[yy2] + xstart, (size_t) xsize); + yy1++; + yy2--; + } + } + if (cy2 != y2) { + register int yy1, yy2; + int xstart = x1 * cimage.bytesperpixel; + int xsize = (x2 - x1 + 1) * cimage.bytesperpixel; + yy1 = y2; + yy2 = 2 * ysym - y2; + while (yy1 > yy2) { + memcpy(cimage.currlines[yy1] + xstart, + cimage.currlines[yy2] + xstart, (size_t) xsize); + yy1--; + yy2++; + } + } + return 1; +} + +int boundarytraceall(number_t * xpos, number_t * ypos) +{ + return (boundarytrace + (0, 0, cimage.width - 1, cimage.height - 1, xpos, ypos)); +} |