Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary   ·   refs   ·   log   ·   tree   ·   commit   ·   diff   ·   stats
path: root/pdf/xpdf/TextOutputDev.cc
diff options
context:
space:
mode:
author Martin Kretzschmar <mkretzschmar@src.gnome.org> 2003-03-31 21:08:43 (GMT)
committer Martin Kretzschmar <mkretzschmar@src.gnome.org> 2003-03-31 21:08:43 (GMT)
commit 64676031423465996e83c4a685290f0c3d97a249 (patch)
tree cef578bc8c300722abb3fe1693181e68895a94a7 /pdf/xpdf/TextOutputDev.cc
parent 28c37dbcf87665a4ccec58bef9ef8ff0697022dd (diff)
kill traces of ltk, incorporate new sources
* xpdf/Makefile.am: kill traces of ltk, incorporate new sources * Makefile.am, configure.in: don't build the ltk subdir * ANNOUNCE, CHANGES, ChangeLog, README, aconf-dj.h, aconf-win32.h, dj_make.bat, ms_make.bat, vms_make.com: update * xpdf/LTKOutputDev.cc, xpdf/LTKOutputDev.h, xpdf/postscript.xbm, xpdf/xpdf-flip.ltk, xpdf/xpdf-ltk.h, xpdf/xpdf-top.ltk, xpdf/xpdf.ltk: remove. * xpdf/Annot.cc, xpdf/Annot.h, xpdf/Array.cc, xpdf/Array.h, xpdf/BuiltinFont.cc, xpdf/BuiltinFont.h, xpdf/BuiltinFontTables.cc, xpdf/CMap.cc, xpdf/CMap.h, xpdf/Catalog.cc, xpdf/Catalog.h, xpdf/CharCodeToUnicode.cc, xpdf/CharCodeToUnicode.h, xpdf/Decrypt.cc, xpdf/Decrypt.h, xpdf/Dict.cc, xpdf/Dict.h, xpdf/Error.cc, xpdf/Error.h, xpdf/FTFont.cc, xpdf/FTFont.h, xpdf/FontFile.cc, xpdf/FontFile.h, xpdf/Function.cc, xpdf/Function.h, xpdf/Gfx.cc, xpdf/Gfx.h, xpdf/GfxFont.cc, xpdf/GfxFont.h, xpdf/GfxState.cc, xpdf/GfxState.h, xpdf/GlobalParams.cc, xpdf/GlobalParams.h, xpdf/ImageOutputDev.cc, xpdf/ImageOutputDev.h, xpdf/Lexer.cc, xpdf/Lexer.h, xpdf/Link.cc, xpdf/Link.h, xpdf/NameToCharCode.cc, xpdf/NameToCharCode.h, xpdf/NameToUnicodeTable.h, xpdf/Object.cc, xpdf/Object.h, xpdf/OutputDev.cc, xpdf/OutputDev.h, xpdf/PBMOutputDev.cc, xpdf/PBMOutputDev.h, xpdf/PDFDoc.cc, xpdf/PDFDoc.h, xpdf/PSOutputDev.cc, xpdf/PSOutputDev.h, xpdf/PSTokenizer.cc, xpdf/PSTokenizer.h, xpdf/Page.cc, xpdf/Page.h, xpdf/Parser.cc, xpdf/Parser.h, xpdf/SFont.cc, xpdf/SFont.h, xpdf/Stream.cc, xpdf/Stream.h, xpdf/T1Font.cc, xpdf/T1Font.h, xpdf/TTFont.cc, xpdf/TTFont.h, xpdf/TextOutputDev.cc, xpdf/TextOutputDev.h, xpdf/UnicodeMap.cc, xpdf/UnicodeMap.h, xpdf/XOutputDev.cc, xpdf/XOutputDev.h, xpdf/XRef.cc, xpdf/XRef.h, xpdf/config.h, xpdf/pdffonts.cc, xpdf/pdfimages.cc, xpdf/pdfinfo.cc, xpdf/pdftopbm.cc, xpdf/pdftops.cc, xpdf/pdftotext.cc, xpdf/vms_make.com, xpdf/xpdf.cc: update. * goo/GHash.cc, goo/GHash.h, goo/GList.cc, goo/GList.h, goo/GString.cc, goo/GString.h: mostly Mac OS X gcc fixage. 
* doc/pdffonts.1, doc/pdffonts.cat, doc/pdffonts.hlp, doc/pdfimages.1, doc/pdfimages.cat, doc/pdfimages.hlp, doc/pdfinfo.1, doc/pdfinfo.cat, doc/pdfinfo.hlp, doc/pdftopbm.1, doc/pdftopbm.cat, doc/pdftopbm.hlp, doc/pdftops.1, doc/pdftops.cat, doc/pdftops.hlp, doc/pdftotext.1, doc/pdftotext.cat, doc/pdftotext.hlp, doc/xpdf.1, doc/xpdf.cat, doc/xpdf.hlp, doc/xpdfrc.5, doc/xpdfrc.cat, doc/xpdfrc.hlp: update docs.
Diffstat (limited to 'pdf/xpdf/TextOutputDev.cc')
-rw-r--r-- pdf/xpdf/TextOutputDev.cc 1191
1 files changed, 876 insertions, 315 deletions
diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc
index 5e5761f..891752c 100644
--- a/pdf/xpdf/TextOutputDev.cc
+++ b/pdf/xpdf/TextOutputDev.cc
@@ -6,11 +6,12 @@
//
//========================================================================
-#ifdef __GNUC__
+#include <aconf.h>
+
+#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif
-#include <aconf.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
@@ -31,14 +32,100 @@
#endif
//------------------------------------------------------------------------
+
+#define textOutSpace 0.2
+#define textOutColSpace 0.2
+
+//------------------------------------------------------------------------
+
+struct TextOutColumnEdge {
+ double x, y0, y1;
+};
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+class TextBlock {
+public:
+
+ TextBlock();
+ ~TextBlock();
+
+ double xMin, xMax;
+ double yMin, yMax;
+ TextString *strings; // list of strings in the block
+ TextBlock *next; // next block in line
+ TextBlock *xyNext; // next block on xyBlocks list
+ Unicode *text; // Unicode text of the block, including
+ // spaces between strings
+ double *xRight; // right-hand x coord of each char
+ int len; // total number of Unicode characters
+ int convertedLen; // total number of converted characters
+ int *col; // starting column number for each
+ // Unicode character
+};
+
+TextBlock::TextBlock() {
+ strings = NULL;
+ next = NULL;
+ xyNext = NULL;
+ text = NULL;
+ xRight = NULL;
+ col = NULL;
+}
+
+TextBlock::~TextBlock() {
+ TextString *p1, *p2;
+
+ for (p1 = strings; p1; p1 = p2) {
+ p2 = p1->next;
+ delete p1;
+ }
+ gfree(text);
+ gfree(xRight);
+ gfree(col);
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+class TextLine {
+public:
+
+ TextLine();
+ ~TextLine();
+
+ TextBlock *blocks;
+ TextLine *next;
+ double yMin, yMax;
+};
+
+TextLine::TextLine() {
+ blocks = NULL;
+ next = NULL;
+}
+
+TextLine::~TextLine() {
+ TextBlock *p1, *p2;
+
+ for (p1 = blocks; p1; p1 = p2) {
+ p2 = p1->next;
+ delete p1;
+ }
+}
+
+//------------------------------------------------------------------------
// TextString
//------------------------------------------------------------------------
-TextString::TextString(GfxState *state, double fontSize) {
+TextString::TextString(GfxState *state, double x0, double y0,
+ double fontSize) {
GfxFont *font;
double x, y;
- state->transform(state->getCurX(), state->getCurY(), &x, &y);
+ state->transform(x0, y0, &x, &y);
if ((font = state->getFont())) {
yMin = y - font->getAscent() * fontSize;
yMax = y - font->getDescent() * fontSize;
@@ -54,14 +141,14 @@ TextString::TextString(GfxState *state, double fontSize) {
yMin = y;
yMax = y + 1;
}
- col = 0;
+ marked = gFalse;
text = NULL;
xRight = NULL;
len = size = 0;
- yxNext = NULL;
- xyNext = NULL;
+ next = NULL;
}
+
TextString::~TextString() {
gfree(text);
gfree(xRight);
@@ -90,10 +177,11 @@ TextPage::TextPage(GBool rawOrderA) {
rawOrder = rawOrderA;
curStr = NULL;
fontSize = 0;
- yxStrings = NULL;
xyStrings = NULL;
- yxCur1 = yxCur2 = NULL;
+ xyCur1 = xyCur2 = NULL;
+ lines = NULL;
nest = 0;
+ nTinyChars = 0;
}
TextPage::~TextPage() {
@@ -104,7 +192,7 @@ void TextPage::updateFont(GfxState *state) {
GfxFont *font;
double *fm;
char *name;
- int code;
+ int code, mCode, letterCode, anyCode;
double w;
// adjust the font size
@@ -116,18 +204,33 @@ void TextPage::updateFont(GfxState *state) {
// rendering the font. This code tries to guess by looking at the
// width of the character 'm' (which breaks if the font is a
// subset that doesn't contain 'm').
+ mCode = letterCode = anyCode = -1;
for (code = 0; code < 256; ++code) {
- if ((name = ((Gfx8BitFont *)font)->getCharName(code)) &&
- name[0] == 'm' && name[1] == '\0') {
- break;
+ name = ((Gfx8BitFont *)font)->getCharName(code);
+ if (name && name[0] == 'm' && name[1] == '\0') {
+ mCode = code;
}
- }
- if (code < 256) {
- w = ((Gfx8BitFont *)font)->getWidth(code);
- if (w != 0) {
- // 600 is a generic average 'm' width -- yes, this is a hack
- fontSize *= w / 0.6;
+ if (letterCode < 0 && name && name[1] == '\0' &&
+ ((name[0] >= 'A' && name[0] <= 'Z') ||
+ (name[0] >= 'a' && name[0] <= 'z'))) {
+ letterCode = code;
}
+ if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) {
+ anyCode = code;
+ }
+ }
+ if (mCode >= 0 &&
+ (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) {
+ // 0.6 is a generic average 'm' width -- yes, this is a hack
+ fontSize *= w / 0.6;
+ } else if (letterCode >= 0 &&
+ (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) {
+ // even more of a hack: 0.5 is a generic letter width
+ fontSize *= w / 0.5;
+ } else if (anyCode >= 0 &&
+ (w = ((Gfx8BitFont *)font)->getWidth(anyCode)) > 0) {
+ // better than nothing: 0.5 is a generic character width
+ fontSize *= w / 0.5;
}
fm = font->getFontMatrix();
if (fm[0] != 0) {
@@ -136,7 +239,7 @@ void TextPage::updateFont(GfxState *state) {
}
}
-void TextPage::beginString(GfxState *state) {
+void TextPage::beginString(GfxState *state, double x0, double y0) {
// This check is needed because Type 3 characters can contain
// text-drawing operations.
if (curStr) {
@@ -144,7 +247,7 @@ void TextPage::beginString(GfxState *state) {
return;
}
- curStr = new TextString(state, fontSize);
+ curStr = new TextString(state, x0, y0, fontSize);
}
void TextPage::addChar(GfxState *state, double x, double y,
@@ -153,17 +256,33 @@ void TextPage::addChar(GfxState *state, double x, double y,
int n, i;
state->transform(x, y, &x1, &y1);
- n = curStr->len;
- if (n > 0 &&
- x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) {
- endString();
- beginString(state);
+ if (x1 < 0 || x1 > state->getPageWidth() ||
+ y1 < 0 || y1 > state->getPageHeight()) {
+ return;
}
state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
0, &dx2, &dy2);
dx -= dx2;
dy -= dy2;
state->transformDelta(dx, dy, &w1, &h1);
+ if (!globalParams->getTextKeepTinyChars() &&
+ fabs(w1) < 3 && fabs(h1) < 3) {
+ if (++nTinyChars > 20000) {
+ return;
+ }
+ }
+ n = curStr->len;
+ if (n > 0 && x1 - curStr->xRight[n-1] >
+ 0.1 * (curStr->yMax - curStr->yMin)) {
+ // large char spacing is sometimes used to move text around
+ endString();
+ beginString(state, x, y);
+ }
+ if (uLen == 1 && u[0] == (Unicode)0x20 &&
+ w1 > 0.5 * (curStr->yMax - curStr->yMin)) {
+ // large word spacing is sometimes used to move text around
+ return;
+ }
if (uLen != 0) {
w1 /= uLen;
h1 /= uLen;
@@ -174,9 +293,6 @@ void TextPage::addChar(GfxState *state, double x, double y,
}
void TextPage::endString() {
- TextString *p1, *p2;
- double h, y1, y2;
-
// This check is needed because Type 3 characters can contain
// text-drawing operations.
if (nest > 0) {
@@ -184,58 +300,72 @@ void TextPage::endString() {
return;
}
+ addString(curStr);
+ curStr = NULL;
+}
+
+void TextPage::addString(TextString *str) {
+ TextString *p1, *p2;
+
// throw away zero-length strings -- they don't have valid xMin/xMax
// values, and they're useless anyway
- if (curStr->len == 0) {
- delete curStr;
- curStr = NULL;
+ if (str->len == 0) {
+ delete str;
return;
}
- // insert string in y-major list
- h = curStr->yMax - curStr->yMin;
- y1 = curStr->yMin + 0.5 * h;
- y2 = curStr->yMin + 0.8 * h;
+ // insert string in xy list
if (rawOrder) {
- p1 = yxCur1;
+ p1 = xyCur1;
p2 = NULL;
- } else if ((!yxCur1 ||
- (y1 >= yxCur1->yMin &&
- (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) &&
- (!yxCur2 ||
- (y1 < yxCur2->yMin ||
- (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) {
- p1 = yxCur1;
- p2 = yxCur2;
+ } else if ((!xyCur1 || xyBefore(xyCur1, str)) &&
+ (!xyCur2 || xyBefore(str, xyCur2))) {
+ p1 = xyCur1;
+ p2 = xyCur2;
+ } else if (xyCur1 && xyBefore(xyCur1, str)) {
+ for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) {
+ if (xyBefore(str, p2)) {
+ break;
+ }
+ }
+ xyCur2 = p2;
} else {
- for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) {
- if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) {
+ for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) {
+ if (xyBefore(str, p2)) {
break;
}
}
- yxCur2 = p2;
+ xyCur2 = p2;
}
- yxCur1 = curStr;
+ xyCur1 = str;
if (p1) {
- p1->yxNext = curStr;
+ p1->next = str;
} else {
- yxStrings = curStr;
+ xyStrings = str;
}
- curStr->yxNext = p2;
- curStr = NULL;
+ str->next = p2;
}
void TextPage::coalesce() {
- TextString *str1, *str2;
- double space, d;
- GBool addSpace;
- int n, i;
+ TextLine *line, *line0;
+ TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2;
+ TextString *str0, *str1, *str2, *str3, *str4;
+ TextString *str1prev, *str2prev, *str3prev;
+ TextOutColumnEdge *edges;
+ UnicodeMap *uMap;
+ GBool isUnicode;
+ char buf[8];
+ int edgesLength, edgesSize;
+ double x, yMin, yMax;
+ double space, fit1, fit2, h;
+ int col1, col2, d;
+ int i, j;
#if 0 //~ for debugging
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- printf("x=%3d..%3d y=%3d..%3d size=%2d '",
- (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax,
- (int)(str1->yMax - str1->yMin));
+ for (str1 = xyStrings; str1; str1 = str1->next) {
+ printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
+ str1->xMin, str1->xMax, str1->yMin, str1->yMax,
+ (str1->yMax - str1->yMin));
for (i = 0; i < str1->len; ++i) {
fputc(str1->text[i] & 0xff, stdout);
}
@@ -243,123 +373,493 @@ void TextPage::coalesce() {
}
printf("\n------------------------------------------------------------\n\n");
#endif
- str1 = yxStrings;
- while (str1 && (str2 = str1->yxNext)) {
- space = str1->yMax - str1->yMin;
- d = str2->xMin - str1->xMax;
- if (((rawOrder &&
- ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
- (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
- (!rawOrder && str2->yMin < str1->yMax)) &&
- d > -0.5 * space && d < space) {
- n = str1->len + str2->len;
- if ((addSpace = d > 0.1 * space)) {
- ++n;
- }
- str1->size = (n + 15) & ~15;
- str1->text = (Unicode *)grealloc(str1->text,
- str1->size * sizeof(Unicode));
- str1->xRight = (double *)grealloc(str1->xRight,
- str1->size * sizeof(double));
- if (addSpace) {
- str1->text[str1->len] = 0x20;
- str1->xRight[str1->len] = str2->xMin;
- ++str1->len;
- }
- for (i = 0; i < str2->len; ++i) {
- str1->text[str1->len] = str2->text[i];
- str1->xRight[str1->len] = str2->xRight[i];
- ++str1->len;
- }
- if (str2->xMax > str1->xMax) {
- str1->xMax = str2->xMax;
- }
- if (str2->yMax > str1->yMax) {
- str1->yMax = str2->yMax;
- }
- str1->yxNext = str2->yxNext;
- delete str2;
+
+ // build the list of column edges
+ edges = NULL;
+ edgesLength = edgesSize = 0;
+ if (!rawOrder) {
+ for (str1prev = NULL, str1 = xyStrings;
+ str1;
+ str1prev = str1, str1 = str1->next) {
+ if (str1->marked) {
+ continue;
+ }
+ h = str1->yMax - str1->yMin;
+ if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) {
+ continue;
+ }
+ x = str1->xMin;
+ yMin = str1->yMin;
+ yMax = str1->yMax;
+ for (str2prev = str1, str2 = str1->next;
+ str2;
+ str2prev = str2, str2 = str2->next) {
+ h = str2->yMax - str2->yMin;
+ if (!str2->marked &&
+ (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
+ fabs(str2->xMin - x) < 0.5 &&
+ str2->yMin - yMax < 0.3 * h &&
+ yMin - str2->yMax < 0.3 * h) {
+ break;
+ }
+ }
+ if (str2) {
+ if (str2->yMin < yMin) {
+ yMin = str2->yMin;
+ }
+ if (str2->yMax > yMax) {
+ yMax = str2->yMax;
+ }
+ str2->marked = gTrue;
+ for (str3prev = str1, str3 = str1->next;
+ str3;
+ str3prev = str3, str3 = str3->next) {
+ h = str3->yMax - str3->yMin;
+ if (!str3->marked &&
+ (str3->xMin - str3prev->xMax) / h > textOutColSpace &&
+ fabs(str3->xMin - x) < 0.5 &&
+ str3->yMin - yMax < 0.3 * h &&
+ yMin - str3->yMax < 0.3 * h) {
+ break;
+ }
+ }
+ if (str3) {
+ if (str3->yMin < yMin) {
+ yMin = str3->yMin;
+ }
+ if (str3->yMax > yMax) {
+ yMax = str3->yMax;
+ }
+ str3->marked = gTrue;
+ do {
+ for (str2prev = str1, str2 = str1->next;
+ str2;
+ str2prev = str2, str2 = str2->next) {
+ h = str2->yMax - str2->yMin;
+ if (!str2->marked &&
+ (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
+ fabs(str2->xMin - x) < 0.5 &&
+ str2->yMin - yMax < 0.3 * h &&
+ yMin - str2->yMax < 0.3 * h) {
+ if (str2->yMin < yMin) {
+ yMin = str2->yMin;
+ }
+ if (str2->yMax > yMax) {
+ yMax = str2->yMax;
+ }
+ str2->marked = gTrue;
+ break;
+ }
+ }
+ } while (str2);
+ if (edgesLength == edgesSize) {
+ edgesSize = edgesSize ? 2 * edgesSize : 16;
+ edges = (TextOutColumnEdge *)
+ grealloc(edges, edgesSize * sizeof(TextOutColumnEdge));
+ }
+ edges[edgesLength].x = x;
+ edges[edgesLength].y0 = yMin;
+ edges[edgesLength].y1 = yMax;
+ ++edgesLength;
+ } else {
+ str2->marked = gFalse;
+ }
+ }
+ str1->marked = gTrue;
+ }
+ }
+
+#if 0 //~ for debugging
+ printf("column edges:\n");
+ for (i = 0; i < edgesLength; ++i) {
+ printf("%d: x=%.2f y0=%.2f y1=%.2f\n",
+ i, edges[i].x, edges[i].y0, edges[i].y1);
+ }
+ printf("\n------------------------------------------------------------\n\n");
+#endif
+
+ // build the blocks
+ yxBlocks = NULL;
+ blk1 = blk2 = NULL;
+ while (xyStrings) {
+
+ // build the block
+ str0 = xyStrings;
+ xyStrings = xyStrings->next;
+ str0->next = NULL;
+ blk = new TextBlock();
+ blk->strings = str0;
+ blk->xMin = str0->xMin;
+ blk->xMax = str0->xMax;
+ blk->yMin = str0->yMin;
+ blk->yMax = str0->yMax;
+ while (xyStrings) {
+ str1 = NULL;
+ str2 = xyStrings;
+ fit1 = coalesceFit(str0, str2);
+ if (!rawOrder) {
+ // look for best-fitting string
+ space = str0->yMax - str0->yMin;
+ for (str3 = xyStrings, str4 = xyStrings->next;
+ str4 && str4->xMin - str0->xMax <= space;
+ str3 = str4, str4 = str4->next) {
+ fit2 = coalesceFit(str0, str4);
+ if (fit2 < fit1) {
+ str1 = str3;
+ str2 = str4;
+ fit1 = fit2;
+ }
+ }
+ }
+ if (fit1 > 1) {
+ // no fit - we're done with this block
+ break;
+ }
+
+ // if we've hit a column edge we're done with this block
+ if (fit1 > 0.2) {
+ for (i = 0; i < edgesLength; ++i) {
+ if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin &&
+ str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 &&
+ str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) {
+ break;
+ }
+ }
+ if (i < edgesLength) {
+ break;
+ }
+ }
+
+ if (str1) {
+ str1->next = str2->next;
+ } else {
+ xyStrings = str2->next;
+ }
+ str0->next = str2;
+ str2->next = NULL;
+ if (str2->xMax > blk->xMax) {
+ blk->xMax = str2->xMax;
+ }
+ if (str2->yMin < blk->yMin) {
+ blk->yMin = str2->yMin;
+ }
+ if (str2->yMax > blk->yMax) {
+ blk->yMax = str2->yMax;
+ }
+ str0 = str2;
+ }
+
+ // insert block on list
+ if (!rawOrder) {
+ // insert block on list in yx order
+ for (blk1 = NULL, blk2 = yxBlocks;
+ blk2 && !yxBefore(blk, blk2);
+ blk1 = blk2, blk2 = blk2->next) ;
+ }
+ blk->next = blk2;
+ if (blk1) {
+ blk1->next = blk;
+ } else {
+ yxBlocks = blk;
+ }
+ blk1 = blk;
+ }
+
+ gfree(edges);
+
+ // the strings are now owned by the lines/blocks tree
+ xyStrings = NULL;
+
+ // build the block text
+ uMap = globalParams->getTextEncoding();
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
+ for (blk = yxBlocks; blk; blk = blk->next) {
+ blk->len = 0;
+ for (str1 = blk->strings; str1; str1 = str1->next) {
+ blk->len += str1->len;
+ if (str1->next && str1->next->xMin - str1->xMax >
+ textOutSpace * (str1->yMax - str1->yMin)) {
+ str1->spaceAfter = gTrue;
+ ++blk->len;
+ } else {
+ str1->spaceAfter = gFalse;
+ }
+ }
+ blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode));
+ blk->xRight = (double *)gmalloc(blk->len * sizeof(double));
+ blk->col = (int *)gmalloc(blk->len * sizeof(int));
+ i = 0;
+ for (str1 = blk->strings; str1; str1 = str1->next) {
+ for (j = 0; j < str1->len; ++j) {
+ blk->text[i] = str1->text[j];
+ blk->xRight[i] = str1->xRight[j];
+ ++i;
+ }
+ if (str1->spaceAfter) {
+ blk->text[i] = (Unicode)0x0020;
+ blk->xRight[i] = str1->next->xMin;
+ ++i;
+ }
+ }
+ blk->convertedLen = 0;
+ for (j = 0; j < blk->len; ++j) {
+ blk->col[j] = blk->convertedLen;
+ if (isUnicode) {
+ ++blk->convertedLen;
+ } else if (uMap) {
+ blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf));
+ }
+ }
+ }
+ if (uMap) {
+ uMap->decRefCnt();
+ }
+
+#if 0 //~ for debugging
+ for (blk = yxBlocks; blk; blk = blk->next) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
+ blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
+ TextString *str;
+ for (str = blk->strings; str; str = str->next) {
+ printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'",
+ str->xMin, str->xMax, str->yMin, str->yMax,
+ (str->yMax - str->yMin));
+ for (i = 0; i < str->len; ++i) {
+ fputc(str->text[i] & 0xff, stdout);
+ }
+ if (str->spaceAfter) {
+ fputc(' ', stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n------------------------------------------------------------\n\n");
+#endif
+
+ // build the lines
+ lines = NULL;
+ line0 = NULL;
+ while (yxBlocks) {
+ blk0 = yxBlocks;
+ yxBlocks = yxBlocks->next;
+ blk0->next = NULL;
+ line = new TextLine();
+ line->blocks = blk0;
+ line->yMin = blk0->yMin;
+ line->yMax = blk0->yMax;
+ while (yxBlocks) {
+
+ // remove duplicated text (fake boldface, shadowed text)
+ h = blk0->yMax - blk0->yMin;
+ if (yxBlocks->len == blk0->len &&
+ !memcmp(yxBlocks->text, blk0->text,
+ yxBlocks->len * sizeof(Unicode)) &&
+ fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 &&
+ fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 &&
+ fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 &&
+ fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) {
+ blk1 = yxBlocks;
+ yxBlocks = yxBlocks->next;
+ delete blk1;
+ continue;
+ }
+
+ if (rawOrder && yxBlocks->yMax < blk0->yMin) {
+ break;
+ }
+ if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax ||
+ yxBlocks->xMin < blk0->xMax) {
+ break;
+ }
+ blk1 = yxBlocks;
+ yxBlocks = yxBlocks->next;
+ blk0->next = blk1;
+ blk1->next = NULL;
+ if (blk1->yMin < line->yMin) {
+ line->yMin = blk1->yMin;
+ }
+ if (blk1->yMax > line->yMax) {
+ line->yMax = blk1->yMax;
+ }
+ blk0 = blk1;
+ }
+ if (line0) {
+ line0->next = line;
} else {
- str1 = str2;
+ lines = line;
+ }
+ line->next = NULL;
+ line0 = line;
+ }
+
+
+ // sort the blocks into xy order
+ xyBlocks = NULL;
+ for (line = lines; line; line = line->next) {
+ for (blk = line->blocks; blk; blk = blk->next) {
+ for (blk1 = NULL, blk2 = xyBlocks;
+ blk2 && !xyBefore(blk, blk2);
+ blk1 = blk2, blk2 = blk2->xyNext) ;
+ blk->xyNext = blk2;
+ if (blk1) {
+ blk1->xyNext = blk;
+ } else {
+ xyBlocks = blk;
+ }
+ }
+ }
+
+#if 0 //~ for debugging
+ for (blk = xyBlocks; blk; blk = blk->xyNext) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
+ blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
+ TextString *str;
+ for (str = blk->strings; str; str = str->next) {
+ printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
+ str->xMin, str->xMax, str->yMin, str->yMax,
+ (str->yMax - str->yMin));
+ for (i = 0; i < str->len; ++i) {
+ fputc(str->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n------------------------------------------------------------\n\n");
+#endif
+
+ // do column assignment
+ for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) {
+ col1 = 0;
+ for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) {
+ if (blk1->xMin >= blk2->xMax) {
+ d = (int)((blk1->xMin - blk2->xMax) /
+ (0.4 * (blk1->yMax - blk1->yMin)));
+ if (d > 4) {
+ d = 4;
+ }
+ col2 = blk2->col[0] + blk2->convertedLen + d;
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ } else if (blk1->xMin > blk2->xMin) {
+ for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ;
+ col2 = blk2->col[i];
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ }
+ for (j = 0; j < blk1->len; ++j) {
+ blk1->col[j] += col1;
}
}
+
+#if 0 //~ for debugging
+ for (line = lines; line; line = line->next) {
+ printf("[line]\n");
+ for (blk = line->blocks; blk; blk = blk->next) {
+ printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len);
+ TextString *str;
+ for (str = blk->strings; str; str = str->next) {
+ printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
+ str->xMin, str->xMax, str->yMin, str->yMax,
+ (str->yMax - str->yMin));
+ for (i = 0; i < str->len; ++i) {
+ fputc(str->text[i] & 0xff, stdout);
+ }
+ if (str->spaceAfter) {
+ printf(" [space]\n");
+ }
+ printf("'\n");
+ }
+ }
+ }
+ printf("\n------------------------------------------------------------\n\n");
+#endif
}
+
GBool TextPage::findText(Unicode *s, int len,
GBool top, GBool bottom,
double *xMin, double *yMin,
double *xMax, double *yMax) {
- TextString *str;
+ TextLine *line;
+ TextBlock *blk;
Unicode *p;
Unicode u1, u2;
int m, i, j;
- double x;
+ double x0, x1, x;
- // scan all strings on page
- for (str = yxStrings; str; str = str->yxNext) {
-
- // check: above top limit?
- if (!top && (str->yMax < *yMin ||
- (str->yMin < *yMin && str->xMax <= *xMin))) {
- continue;
- }
-
- // check: below bottom limit?
- if (!bottom && (str->yMin > *yMax ||
- (str->yMax > *yMax && str->xMin >= *xMax))) {
- return gFalse;
- }
-
- // search each position in this string
- m = str->len;
- for (i = 0, p = str->text; i <= m - len; ++i, ++p) {
+ // scan all blocks on page
+ for (line = lines; line; line = line->next) {
+ for (blk = line->blocks; blk; blk = blk->next) {
// check: above top limit?
- if (!top && str->yMin < *yMin) {
- x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
- if (x < *xMin) {
- continue;
- }
+ if (!top && (blk->yMax < *yMin ||
+ (blk->yMin < *yMin && blk->xMax <= *xMin))) {
+ continue;
}
// check: below bottom limit?
- if (!bottom && str->yMax > *yMax) {
- x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2;
- if (x > *xMax) {
- return gFalse;
- }
+ if (!bottom && (blk->yMin > *yMax ||
+ (blk->yMax > *yMax && blk->xMin >= *xMax))) {
+ return gFalse;
}
- // compare the strings
- for (j = 0; j < len; ++j) {
-#if 1 //~ this lowercases Latin A-Z only -- this will eventually be
- //~ extended to handle other character sets
- if (p[j] >= 0x41 && p[j] <= 0x5a) {
- u1 = p[j] + 0x20;
- } else {
- u1 = p[j];
+ // search each position in this block
+ m = blk->len;
+ for (i = 0, p = blk->text; i <= m - len; ++i, ++p) {
+
+ x0 = (i == 0) ? blk->xMin : blk->xRight[i-1];
+ x1 = blk->xRight[i];
+ x = 0.5 * (x0 + x1);
+
+ // check: above top limit?
+ if (!top && blk->yMin < *yMin) {
+ if (x < *xMin) {
+ continue;
+ }
}
- if (s[j] >= 0x41 && s[j] <= 0x5a) {
- u2 = s[j] + 0x20;
- } else {
- u2 = s[j];
+
+ // check: below bottom limit?
+ if (!bottom && blk->yMax > *yMax) {
+ if (x > *xMax) {
+ return gFalse;
+ }
}
+
+ // compare the strings
+ for (j = 0; j < len; ++j) {
+#if 1 //~ this lowercases Latin A-Z only -- this will eventually be
+ //~ extended to handle other character sets
+ if (p[j] >= 0x41 && p[j] <= 0x5a) {
+ u1 = p[j] + 0x20;
+ } else {
+ u1 = p[j];
+ }
+ if (s[j] >= 0x41 && s[j] <= 0x5a) {
+ u2 = s[j] + 0x20;
+ } else {
+ u2 = s[j];
+ }
#endif
- if (u1 != u2) {
- break;
+ if (u1 != u2) {
+ break;
+ }
}
- }
- // found it
- if (j == len) {
- *xMin = (i == 0) ? str->xMin : str->xRight[i-1];
- *xMax = str->xRight[i + len - 1];
- *yMin = str->yMin;
- *yMax = str->yMax;
- return gTrue;
+ // found it
+ if (j == len) {
+ *xMin = x0;
+ *xMax = blk->xRight[i + len - 1];
+ *yMin = blk->yMin;
+ *yMax = blk->yMax;
+ return gTrue;
+ }
}
}
}
+
return gFalse;
}
@@ -367,18 +867,22 @@ GString *TextPage::getText(double xMin, double yMin,
double xMax, double yMax) {
GString *s;
UnicodeMap *uMap;
+ GBool isUnicode;
char space[8], eol[16], buf[8];
- int spaceLen, eolLen, n;
- TextString *str1;
- double x0, x1, x2, y;
- double xPrev, yPrev;
- int i1, i2, i;
+ int spaceLen, eolLen, len;
+ TextLine *line;
+ TextBlock *blk;
+ double x0, x1, y;
+ int firstCol, col, i;
GBool multiLine;
s = new GString();
+
+ // get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
return s;
}
+ isUnicode = uMap->isUnicode();
spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
eolLen = 0; // make gcc happy
switch (globalParams->getTextEOL()) {
@@ -393,61 +897,126 @@ GString *TextPage::getText(double xMin, double yMin,
eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
break;
}
- xPrev = yPrev = 0;
+
+ // find the leftmost column
multiLine = gFalse;
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- y = 0.5 * (str1->yMin + str1->yMax);
- if (y > yMax) {
+ firstCol = -1;
+ for (line = lines; line; line = line->next) {
+ if (line->yMin > yMax) {
break;
}
- if (y > yMin && str1->xMin < xMax && str1->xMax > xMin) {
- x0 = x1 = x2 = str1->xMin;
- for (i1 = 0; i1 < str1->len; ++i1) {
- x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1];
- x1 = str1->xRight[i1];
- if (0.5 * (x0 + x1) >= xMin) {
- break;
- }
+ if (line->yMax < yMin) {
+ continue;
+ }
+
+ for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
+ if (!blk || blk->xMin > xMax) {
+ continue;
+ }
+
+ y = 0.5 * (blk->yMin + blk->yMax);
+ if (y < yMin || y > yMax) {
+ continue;
+ }
+
+ if (firstCol >= 0) {
+ multiLine = gTrue;
+ }
+
+ i = 0;
+ while (1) {
+ x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
+ x1 = blk->xRight[i];
+ if (0.5 * (x0 + x1) > xMin) {
+ break;
}
- for (i2 = str1->len - 1; i2 > i1; --i2) {
- x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1];
- x2 = str1->xRight[i2];
- if (0.5 * (x1 + x2) <= xMax) {
- break;
- }
+ ++i;
+ }
+ col = blk->col[i];
+
+ if (firstCol < 0 || col < firstCol) {
+ firstCol = col;
+ }
+ }
+
+ // extract the text
+ for (line = lines; line; line = line->next) {
+ if (line->yMin > yMax) {
+ break;
+ }
+ if (line->yMax < yMin) {
+ continue;
+ }
+
+ for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
+ if (!blk || blk->xMin > xMax) {
+ continue;
+ }
+
+ y = 0.5 * (blk->yMin + blk->yMax);
+ if (y < yMin || y > yMax) {
+ continue;
+ }
+
+ i = 0;
+ while (1) {
+ x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
+ x1 = blk->xRight[i];
+ if (0.5 * (x0 + x1) > xMin) {
+ break;
}
- if (s->getLength() > 0) {
- if (x0 < xPrev || str1->yMin > yPrev) {
- s->append(eol, eolLen);
- multiLine = gTrue;
- } else {
- for (i = 0; i < 4; ++i) {
- s->append(space, spaceLen);
- }
+ ++i;
+ }
+
+ col = firstCol;
+
+ do {
+
+ // line this block up with the correct column
+ for (; col < blk->col[i]; ++col) {
+ s->append(space, spaceLen);
+ }
+
+ // print the block
+ for (; i < blk->len; ++i) {
+
+ x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
+ x1 = blk->xRight[i];
+ if (0.5 * (x0 + x1) > xMax) {
+ break;
}
+
+ len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
+ s->append(buf, len);
+ col += isUnicode ? 1 : len;
}
- for (i = i1; i <= i2; ++i) {
- n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf));
- s->append(buf, n);
+ if (i < blk->len) {
+ break;
}
- xPrev = x2;
- yPrev = str1->yMax;
+
+ // next block
+ blk = blk->next;
+ i = 0;
+
+ } while (blk && blk->xMin < xMax);
+
+ if (multiLine) {
+ s->append(eol, eolLen);
}
}
- if (multiLine) {
- s->append(eol, eolLen);
- }
+
uMap->decRefCnt();
+
return s;
}
// Writes the page text through <outputFunc>, one output line per
// TextLine in <lines>.  Each block is padded with spaces up to its
// starting column (blk->col[0]); in raw order the first block of a
// line is not padded.  Characters are converted with the text
// encoding's UnicodeMap, every line ends with an eol sequence, and
// vertical gaps between lines become extra eols (at most 2 in raw
// order, at most 5 otherwise).
void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
UnicodeMap *uMap;
char space[8], eol[16], eop[8], buf[8];
- int spaceLen, eolLen, eopLen, n;
- TextString *str1, *str2, *str3;
- double yMin, yMax;
- int col1, col2, d, i;
+ int spaceLen, eolLen, eopLen, len;
+ TextLine *line;
+ TextBlock *blk;
+ int col, d, i;
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
@@ -469,129 +1038,46 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
}
eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
- // build x-major list
- xyStrings = NULL;
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- for (str2 = NULL, str3 = xyStrings;
- str3;
- str2 = str3, str3 = str3->xyNext) {
- if (str1->xMin < str3->xMin ||
- (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) {
- break;
- }
- }
- if (str2) {
- str2->xyNext = str1;
- } else {
- xyStrings = str1;
- }
- str1->xyNext = str3;
- }
-
- // do column assignment
- for (str1 = xyStrings; str1; str1 = str1->xyNext) {
- col1 = 0;
- for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) {
- if (str1->xMin >= str2->xMax) {
- col2 = str2->col + str2->len + 4;
- if (col2 > col1) {
- col1 = col2;
- }
- } else if (str1->xMin > str2->xMin) {
- col2 = str2->col +
- (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) *
- str2->len);
- if (col2 > col1) {
- col1 = col2;
+ // output
+ for (line = lines; line; line = line->next) {
+ col = 0;
+ for (blk = line->blocks; blk; blk = blk->next) {
+
+ // line this block up with the correct column
+ if (rawOrder && col == 0) {
+ col = blk->col[0];
+ } else {
+ for (; col < blk->col[0]; ++col) {
+ (*outputFunc)(outputStream, space, spaceLen);
}
}
- }
- str1->col = col1;
- }
-
-#if 0 //~ for debugging
- fprintf((FILE *)outputStream, "~~~~~~~~~~\n");
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- fprintf((FILE *)outputStream, "(%4d,%4d) - (%4d,%4d) [%3d] '",
- (int)str1->xMin, (int)str1->yMin,
- (int)str1->xMax, (int)str1->yMax, str1->col);
- for (i = 0; i < str1->len; ++i) {
- fputc(str1->text[i] & 0xff, stdout);
- }
- printf("'\n");
- }
- fprintf((FILE *)outputStream, "~~~~~~~~~~\n");
-#endif
-
- // output
- col1 = 0;
- yMax = yxStrings ? yxStrings->yMax : 0;
- for (str1 = yxStrings; str1; str1 = str1->yxNext) {
- // line this string up with the correct column
- if (rawOrder && col1 == 0) {
- col1 = str1->col;
- } else {
- for (; col1 < str1->col; ++col1) {
- (*outputFunc)(outputStream, space, spaceLen);
+ // print the block
+ for (i = 0; i < blk->len; ++i) {
+ len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
+ (*outputFunc)(outputStream, buf, len);
}
+ col += blk->convertedLen;
}
- // print the string
- for (i = 0; i < str1->len; ++i) {
- if ((n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf))) > 0) {
- (*outputFunc)(outputStream, buf, n);
+ // print a return
+ (*outputFunc)(outputStream, eol, eolLen);
+
+ // print extra vertical space if necessary
+ if (line->next) {
+ // gap to the next line, measured in units of the height of the
+ // string at the head of this line's first block; both yMax and
+ // yMin must come from the current <line>, not from <lines> (the
+ // first line of the page)
+ d = (int)((line->next->yMin - line->yMax) /
+ (line->blocks->strings->yMax - line->blocks->strings->yMin)
+ + 0.5);
+ // various things (weird font matrices) can result in bogus
+ // values here, so do a sanity check
+ if (rawOrder && d > 2) {
+ d = 2;
+ } else if (!rawOrder && d > 5) {
+ d = 5;
}
- }
-
- // increment column
- col1 += str1->len;
-
- // update yMax for this line
- if (str1->yMax > yMax) {
- yMax = str1->yMax;
- }
-
- // if we've hit the end of the line...
- if (!(str1->yxNext &&
- !(rawOrder && str1->yxNext->yMax < str1->yMin) &&
- str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax &&
- str1->yxNext->xMin >= str1->xMax)) {
-
- // print a return
- (*outputFunc)(outputStream, eol, eolLen);
-
- // print extra vertical space if necessary
- if (str1->yxNext) {
-
- // find yMin for next line
- yMin = str1->yxNext->yMin;
- for (str2 = str1->yxNext; str2; str2 = str2->yxNext) {
- if (str2->yMin < yMin) {
- yMin = str2->yMin;
- }
- if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax &&
- str2->yxNext->xMin >= str2->xMax))
- break;
- }
-
- // print the space
- d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5);
- // various things (weird font matrices) can result in bogus
- // values here, so do a sanity check
- if (rawOrder && d > 2) {
- d = 2;
- } else if (!rawOrder && d > 5) {
- d = 5;
- }
- for (; d > 0; --d) {
- (*outputFunc)(outputStream, eol, eolLen);
- }
+ for (; d > 0; --d) {
+ (*outputFunc)(outputStream, eol, eolLen);
}
-
- // set up for next line
- col1 = 0;
- yMax = str1->yxNext ? str1->yxNext->yMax : 0;
}
}
@@ -603,20 +1089,89 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
uMap->decRefCnt();
}
+// Ordering predicate for xy order on strings: <str1> goes before
+// <str2> when its xMin is smaller; when the xMin values are equal
+// the string with the smaller yMin goes first.
+GBool TextPage::xyBefore(TextString *str1, TextString *str2) {
+  if (str1->xMin != str2->xMin) {
+    return str1->xMin < str2->xMin;
+  }
+  return str1->yMin < str2->yMin;
+}
+
+// Ordering predicate for xy order on blocks: <blk1> goes before
+// <blk2> when its xMin is smaller; when the xMin values are equal
+// the block with the smaller yMin goes first.
+GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) {
+  if (blk1->xMin != blk2->xMin) {
+    return blk1->xMin < blk2->xMin;
+  }
+  return blk1->yMin < blk2->yMin;
+}
+
+// Ordering predicate for yx order on blocks.  When the vertical
+// extents of the two blocks overlap by more than 60% of the shorter
+// block's height, the blocks are compared by xMin; otherwise the
+// block with the smaller yMin goes first.
+GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) {
+  double height1, height2, minHeight, sharedHi, sharedLo;
+
+  height1 = blk1->yMax - blk1->yMin;
+  height2 = blk2->yMax - blk2->yMin;
+  minHeight = (height1 < height2) ? height1 : height2;
+
+  // [sharedLo, sharedHi] is the y range common to both blocks
+  sharedHi = (blk1->yMax < blk2->yMax) ? blk1->yMax : blk2->yMax;
+  sharedLo = (blk1->yMin > blk2->yMin) ? blk1->yMin : blk2->yMin;
+
+  if ((sharedHi - sharedLo) / minHeight > 0.6) {
+    return blk1->xMin < blk2->xMin;
+  }
+  return blk1->yMin < blk2->yMin;
+}
+
+// Scores <str2> as a horizontal continuation of <str1>.  Returns the
+// gap between str1's right edge and str2's left edge, in units of
+// the taller string's height, or 10 when any of the checks below
+// rejects the pair.
+double TextPage::coalesceFit(TextString *str1, TextString *str2) {
+  double height1, height2, width1, width2;
+  double ratio, sharedHi, sharedLo, gap, spacing;
+
+  height1 = str1->yMax - str1->yMin;
+  height2 = str2->yMax - str2->yMin;
+  width1 = str1->xMax - str1->xMin;
+  width2 = str2->xMax - str2->xMin;
+
+  // heights must be within a factor of three of each other
+  ratio = height1 / height2;
+  if (ratio < (1.0 / 3.0) || ratio > 3) {
+    return 10;
+  }
+
+  // vertical overlap must be at least half of the shorter height
+  sharedHi = (str1->yMax < str2->yMax) ? str1->yMax : str2->yMax;
+  sharedLo = (str1->yMin > str2->yMin) ? str1->yMin : str2->yMin;
+  if ((sharedHi - sharedLo) / (height1 < height2 ? height1 : height2) < 0.5) {
+    return 10;
+  }
+
+  // str2 may not start more than half a (taller) height before the
+  // end of str1
+  gap = str2->xMin - str1->xMax;
+  spacing = gap / (height1 > height2 ? height1 : height2);
+  if (spacing < -0.5) {
+    return 10;
+  }
+
+  // separate text that overlaps - duplicated text (so that fake
+  // boldface and shadowed text can be cleanly removed)
+  if (gap / (width1 < width2 ? width1 : width2) < -0.7) {
+    return 10;
+  }
+
+  return spacing;
+}
+
// Resets the page: deletes the pending string (curStr), deletes the
// list of lines or, when there are no lines, the list of strings, and
// clears the list heads, cursors and counters.
// NOTE(review): the strings list is not walked when <lines> is
// non-empty -- presumably the lines own their strings by then; confirm
// against the TextLine destructor.
void TextPage::clear() {
- TextString *p1, *p2;
+ TextLine *p1, *p2;
+ TextString *s1, *s2;
if (curStr) {
delete curStr;
curStr = NULL;
}
- for (p1 = yxStrings; p1; p1 = p2) {
- p2 = p1->yxNext;
- delete p1;
+ if (lines) {
+ for (p1 = lines; p1; p1 = p2) {
+ p2 = p1->next; // fetch the successor before p1 is deleted
+ delete p1;
+ }
+ } else if (xyStrings) {
+ for (s1 = xyStrings; s1; s1 = s2) {
+ s2 = s1->next; // fetch the successor before s1 is deleted
+ delete s1;
+ }
}
- yxStrings = NULL;
xyStrings = NULL;
- yxCur1 = yxCur2 = NULL;
+ xyCur1 = xyCur2 = NULL;
+ lines = NULL;
+ nest = 0;
+ nTinyChars = 0;
}
//------------------------------------------------------------------------
@@ -691,7 +1246,7 @@ void TextOutputDev::updateFont(GfxState *state) {
}
// Forwards the start of a string to the text page, passing the
// graphics state's current position (getCurX/getCurY).  The <s>
// argument is not used here.
void TextOutputDev::beginString(GfxState *state, GString *s) {
- text->beginString(state);
+ text->beginString(state, state->getCurX(), state->getCurY());
}
void TextOutputDev::endString(GfxState *state) {
@@ -711,3 +1266,9 @@ GBool TextOutputDev::findText(Unicode *s, int len,
double *xMax, double *yMax) {
return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
}
+
+// Returns the text page's getText() result for the rectangle
+// (xMin, yMin) - (xMax, yMax).
+GString *TextOutputDev::getText(double xMin, double yMin,
+                                double xMax, double yMax) {
+  GString *extracted;
+
+  extracted = text->getText(xMin, yMin, xMax, yMax);
+  return extracted;
+}
+