diff options
author | Martin Kretzschmar <mkretzschmar@src.gnome.org> | 2003-03-31 21:08:43 (GMT) |
---|---|---|
committer | Martin Kretzschmar <mkretzschmar@src.gnome.org> | 2003-03-31 21:08:43 (GMT) |
commit | 64676031423465996e83c4a685290f0c3d97a249 (patch) | |
tree | cef578bc8c300722abb3fe1693181e68895a94a7 /pdf/xpdf/TextOutputDev.cc | |
parent | 28c37dbcf87665a4ccec58bef9ef8ff0697022dd (diff) |
kill traces of ltk, incorporate new sources
* xpdf/Makefile.am: kill traces of ltk, incorporate new sources
* Makefile.am, configure.in: don't build the ltk subdir
* ANNOUNCE, CHANGES, ChangeLog, README, aconf-dj.h, aconf-win32.h,
dj_make.bat, ms_make.bat, vms_make.com: update
* xpdf/LTKOutputDev.cc, xpdf/LTKOutputDev.h, xpdf/postscript.xbm,
xpdf/xpdf-flip.ltk, xpdf/xpdf-ltk.h, xpdf/xpdf-top.ltk,
xpdf/xpdf.ltk: remove.
* xpdf/Annot.cc, xpdf/Annot.h, xpdf/Array.cc, xpdf/Array.h,
xpdf/BuiltinFont.cc, xpdf/BuiltinFont.h,
xpdf/BuiltinFontTables.cc, xpdf/CMap.cc, xpdf/CMap.h,
xpdf/Catalog.cc, xpdf/Catalog.h, xpdf/CharCodeToUnicode.cc,
xpdf/CharCodeToUnicode.h, xpdf/Decrypt.cc, xpdf/Decrypt.h,
xpdf/Dict.cc, xpdf/Dict.h, xpdf/Error.cc, xpdf/Error.h,
xpdf/FTFont.cc, xpdf/FTFont.h, xpdf/FontFile.cc, xpdf/FontFile.h,
xpdf/Function.cc, xpdf/Function.h, xpdf/Gfx.cc, xpdf/Gfx.h,
xpdf/GfxFont.cc, xpdf/GfxFont.h, xpdf/GfxState.cc,
xpdf/GfxState.h, xpdf/GlobalParams.cc, xpdf/GlobalParams.h,
xpdf/ImageOutputDev.cc, xpdf/ImageOutputDev.h, xpdf/Lexer.cc,
xpdf/Lexer.h, xpdf/Link.cc, xpdf/Link.h, xpdf/NameToCharCode.cc,
xpdf/NameToCharCode.h, xpdf/NameToUnicodeTable.h, xpdf/Object.cc,
xpdf/Object.h, xpdf/OutputDev.cc, xpdf/OutputDev.h,
xpdf/PBMOutputDev.cc, xpdf/PBMOutputDev.h, xpdf/PDFDoc.cc,
xpdf/PDFDoc.h, xpdf/PSOutputDev.cc, xpdf/PSOutputDev.h,
xpdf/PSTokenizer.cc, xpdf/PSTokenizer.h, xpdf/Page.cc,
xpdf/Page.h, xpdf/Parser.cc, xpdf/Parser.h, xpdf/SFont.cc,
xpdf/SFont.h, xpdf/Stream.cc, xpdf/Stream.h, xpdf/T1Font.cc,
xpdf/T1Font.h, xpdf/TTFont.cc, xpdf/TTFont.h,
xpdf/TextOutputDev.cc, xpdf/TextOutputDev.h, xpdf/UnicodeMap.cc,
xpdf/UnicodeMap.h, xpdf/XOutputDev.cc, xpdf/XOutputDev.h,
xpdf/XRef.cc, xpdf/XRef.h, xpdf/config.h, xpdf/pdffonts.cc,
xpdf/pdfimages.cc, xpdf/pdfinfo.cc, xpdf/pdftopbm.cc,
xpdf/pdftops.cc, xpdf/pdftotext.cc, xpdf/vms_make.com,
xpdf/xpdf.cc: update.
* goo/GHash.cc, goo/GHash.h, goo/GList.cc, goo/GList.h,
goo/GString.cc, goo/GString.h: mostly Mac OS X gcc fixes.
* doc/pdffonts.1, doc/pdffonts.cat, doc/pdffonts.hlp,
doc/pdfimages.1, doc/pdfimages.cat, doc/pdfimages.hlp,
doc/pdfinfo.1, doc/pdfinfo.cat, doc/pdfinfo.hlp, doc/pdftopbm.1,
doc/pdftopbm.cat, doc/pdftopbm.hlp, doc/pdftops.1,
doc/pdftops.cat, doc/pdftops.hlp, doc/pdftotext.1,
doc/pdftotext.cat, doc/pdftotext.hlp, doc/xpdf.1, doc/xpdf.cat,
doc/xpdf.hlp, doc/xpdfrc.5, doc/xpdfrc.cat, doc/xpdfrc.hlp:
update docs.
Diffstat (limited to 'pdf/xpdf/TextOutputDev.cc')
-rw-r--r-- | pdf/xpdf/TextOutputDev.cc | 1191 |
1 files changed, 876 insertions, 315 deletions
diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc index 5e5761f..891752c 100644 --- a/pdf/xpdf/TextOutputDev.cc +++ b/pdf/xpdf/TextOutputDev.cc @@ -6,11 +6,12 @@ // //======================================================================== -#ifdef __GNUC__ +#include <aconf.h> + +#ifdef USE_GCC_PRAGMAS #pragma implementation #endif -#include <aconf.h> #include <stdio.h> #include <stdlib.h> #include <stddef.h> @@ -31,14 +32,100 @@ #endif //------------------------------------------------------------------------ + +#define textOutSpace 0.2 +#define textOutColSpace 0.2 + +//------------------------------------------------------------------------ + +struct TextOutColumnEdge { + double x, y0, y1; +}; + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +class TextBlock { +public: + + TextBlock(); + ~TextBlock(); + + double xMin, xMax; + double yMin, yMax; + TextString *strings; // list of strings in the block + TextBlock *next; // next block in line + TextBlock *xyNext; // next block on xyBlocks list + Unicode *text; // Unicode text of the block, including + // spaces between strings + double *xRight; // right-hand x coord of each char + int len; // total number of Unicode characters + int convertedLen; // total number of converted characters + int *col; // starting column number for each + // Unicode character +}; + +TextBlock::TextBlock() { + strings = NULL; + next = NULL; + xyNext = NULL; + text = NULL; + xRight = NULL; + col = NULL; +} + +TextBlock::~TextBlock() { + TextString *p1, *p2; + + for (p1 = strings; p1; p1 = p2) { + p2 = p1->next; + delete p1; + } + gfree(text); + gfree(xRight); + gfree(col); +} + +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ + +class TextLine { +public: + + TextLine(); + ~TextLine(); + + 
TextBlock *blocks; + TextLine *next; + double yMin, yMax; +}; + +TextLine::TextLine() { + blocks = NULL; + next = NULL; +} + +TextLine::~TextLine() { + TextBlock *p1, *p2; + + for (p1 = blocks; p1; p1 = p2) { + p2 = p1->next; + delete p1; + } +} + +//------------------------------------------------------------------------ // TextString //------------------------------------------------------------------------ -TextString::TextString(GfxState *state, double fontSize) { +TextString::TextString(GfxState *state, double x0, double y0, + double fontSize) { GfxFont *font; double x, y; - state->transform(state->getCurX(), state->getCurY(), &x, &y); + state->transform(x0, y0, &x, &y); if ((font = state->getFont())) { yMin = y - font->getAscent() * fontSize; yMax = y - font->getDescent() * fontSize; @@ -54,14 +141,14 @@ TextString::TextString(GfxState *state, double fontSize) { yMin = y; yMax = y + 1; } - col = 0; + marked = gFalse; text = NULL; xRight = NULL; len = size = 0; - yxNext = NULL; - xyNext = NULL; + next = NULL; } + TextString::~TextString() { gfree(text); gfree(xRight); @@ -90,10 +177,11 @@ TextPage::TextPage(GBool rawOrderA) { rawOrder = rawOrderA; curStr = NULL; fontSize = 0; - yxStrings = NULL; xyStrings = NULL; - yxCur1 = yxCur2 = NULL; + xyCur1 = xyCur2 = NULL; + lines = NULL; nest = 0; + nTinyChars = 0; } TextPage::~TextPage() { @@ -104,7 +192,7 @@ void TextPage::updateFont(GfxState *state) { GfxFont *font; double *fm; char *name; - int code; + int code, mCode, letterCode, anyCode; double w; // adjust the font size @@ -116,18 +204,33 @@ void TextPage::updateFont(GfxState *state) { // rendering the font. This code tries to guess by looking at the // width of the character 'm' (which breaks if the font is a // subset that doesn't contain 'm'). 
+ mCode = letterCode = anyCode = -1; for (code = 0; code < 256; ++code) { - if ((name = ((Gfx8BitFont *)font)->getCharName(code)) && - name[0] == 'm' && name[1] == '\0') { - break; + name = ((Gfx8BitFont *)font)->getCharName(code); + if (name && name[0] == 'm' && name[1] == '\0') { + mCode = code; } - } - if (code < 256) { - w = ((Gfx8BitFont *)font)->getWidth(code); - if (w != 0) { - // 600 is a generic average 'm' width -- yes, this is a hack - fontSize *= w / 0.6; + if (letterCode < 0 && name && name[1] == '\0' && + ((name[0] >= 'A' && name[0] <= 'Z') || + (name[0] >= 'a' && name[0] <= 'z'))) { + letterCode = code; } + if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) { + anyCode = code; + } + } + if (mCode >= 0 && + (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) { + // 0.6 is a generic average 'm' width -- yes, this is a hack + fontSize *= w / 0.6; + } else if (letterCode >= 0 && + (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) { + // even more of a hack: 0.5 is a generic letter width + fontSize *= w / 0.5; + } else if (anyCode >= 0 && + (w = ((Gfx8BitFont *)font)->getWidth(anyCode)) > 0) { + // better than nothing: 0.5 is a generic character width + fontSize *= w / 0.5; } fm = font->getFontMatrix(); if (fm[0] != 0) { @@ -136,7 +239,7 @@ void TextPage::updateFont(GfxState *state) { } } -void TextPage::beginString(GfxState *state) { +void TextPage::beginString(GfxState *state, double x0, double y0) { // This check is needed because Type 3 characters can contain // text-drawing operations. 
if (curStr) { @@ -144,7 +247,7 @@ void TextPage::beginString(GfxState *state) { return; } - curStr = new TextString(state, fontSize); + curStr = new TextString(state, x0, y0, fontSize); } void TextPage::addChar(GfxState *state, double x, double y, @@ -153,17 +256,33 @@ void TextPage::addChar(GfxState *state, double x, double y, int n, i; state->transform(x, y, &x1, &y1); - n = curStr->len; - if (n > 0 && - x1 - curStr->xRight[n-1] > 0.1 * (curStr->yMax - curStr->yMin)) { - endString(); - beginString(state); + if (x1 < 0 || x1 > state->getPageWidth() || + y1 < 0 || y1 > state->getPageHeight()) { + return; } state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(), 0, &dx2, &dy2); dx -= dx2; dy -= dy2; state->transformDelta(dx, dy, &w1, &h1); + if (!globalParams->getTextKeepTinyChars() && + fabs(w1) < 3 && fabs(h1) < 3) { + if (++nTinyChars > 20000) { + return; + } + } + n = curStr->len; + if (n > 0 && x1 - curStr->xRight[n-1] > + 0.1 * (curStr->yMax - curStr->yMin)) { + // large char spacing is sometimes used to move text around + endString(); + beginString(state, x, y); + } + if (uLen == 1 && u[0] == (Unicode)0x20 && + w1 > 0.5 * (curStr->yMax - curStr->yMin)) { + // large word spacing is sometimes used to move text around + return; + } if (uLen != 0) { w1 /= uLen; h1 /= uLen; @@ -174,9 +293,6 @@ void TextPage::addChar(GfxState *state, double x, double y, } void TextPage::endString() { - TextString *p1, *p2; - double h, y1, y2; - // This check is needed because Type 3 characters can contain // text-drawing operations. 
if (nest > 0) { @@ -184,58 +300,72 @@ void TextPage::endString() { return; } + addString(curStr); + curStr = NULL; +} + +void TextPage::addString(TextString *str) { + TextString *p1, *p2; + // throw away zero-length strings -- they don't have valid xMin/xMax // values, and they're useless anyway - if (curStr->len == 0) { - delete curStr; - curStr = NULL; + if (str->len == 0) { + delete str; return; } - // insert string in y-major list - h = curStr->yMax - curStr->yMin; - y1 = curStr->yMin + 0.5 * h; - y2 = curStr->yMin + 0.8 * h; + // insert string in xy list if (rawOrder) { - p1 = yxCur1; + p1 = xyCur1; p2 = NULL; - } else if ((!yxCur1 || - (y1 >= yxCur1->yMin && - (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) && - (!yxCur2 || - (y1 < yxCur2->yMin || - (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) { - p1 = yxCur1; - p2 = yxCur2; + } else if ((!xyCur1 || xyBefore(xyCur1, str)) && + (!xyCur2 || xyBefore(str, xyCur2))) { + p1 = xyCur1; + p2 = xyCur2; + } else if (xyCur1 && xyBefore(xyCur1, str)) { + for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) { + if (xyBefore(str, p2)) { + break; + } + } + xyCur2 = p2; } else { - for (p1 = NULL, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) { - if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin)) { + for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) { + if (xyBefore(str, p2)) { break; } } - yxCur2 = p2; + xyCur2 = p2; } - yxCur1 = curStr; + xyCur1 = str; if (p1) { - p1->yxNext = curStr; + p1->next = str; } else { - yxStrings = curStr; + xyStrings = str; } - curStr->yxNext = p2; - curStr = NULL; + str->next = p2; } void TextPage::coalesce() { - TextString *str1, *str2; - double space, d; - GBool addSpace; - int n, i; + TextLine *line, *line0; + TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2; + TextString *str0, *str1, *str2, *str3, *str4; + TextString *str1prev, *str2prev, *str3prev; + TextOutColumnEdge *edges; + UnicodeMap *uMap; + GBool isUnicode; + char buf[8]; + int 
edgesLength, edgesSize; + double x, yMin, yMax; + double space, fit1, fit2, h; + int col1, col2, d; + int i, j; #if 0 //~ for debugging - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - printf("x=%3d..%3d y=%3d..%3d size=%2d '", - (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax, - (int)(str1->yMax - str1->yMin)); + for (str1 = xyStrings; str1; str1 = str1->next) { + printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", + str1->xMin, str1->xMax, str1->yMin, str1->yMax, + (str1->yMax - str1->yMin)); for (i = 0; i < str1->len; ++i) { fputc(str1->text[i] & 0xff, stdout); } @@ -243,123 +373,493 @@ void TextPage::coalesce() { } printf("\n------------------------------------------------------------\n\n"); #endif - str1 = yxStrings; - while (str1 && (str2 = str1->yxNext)) { - space = str1->yMax - str1->yMin; - d = str2->xMin - str1->xMax; - if (((rawOrder && - ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) || - (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) || - (!rawOrder && str2->yMin < str1->yMax)) && - d > -0.5 * space && d < space) { - n = str1->len + str2->len; - if ((addSpace = d > 0.1 * space)) { - ++n; - } - str1->size = (n + 15) & ~15; - str1->text = (Unicode *)grealloc(str1->text, - str1->size * sizeof(Unicode)); - str1->xRight = (double *)grealloc(str1->xRight, - str1->size * sizeof(double)); - if (addSpace) { - str1->text[str1->len] = 0x20; - str1->xRight[str1->len] = str2->xMin; - ++str1->len; - } - for (i = 0; i < str2->len; ++i) { - str1->text[str1->len] = str2->text[i]; - str1->xRight[str1->len] = str2->xRight[i]; - ++str1->len; - } - if (str2->xMax > str1->xMax) { - str1->xMax = str2->xMax; - } - if (str2->yMax > str1->yMax) { - str1->yMax = str2->yMax; - } - str1->yxNext = str2->yxNext; - delete str2; + + // build the list of column edges + edges = NULL; + edgesLength = edgesSize = 0; + if (!rawOrder) { + for (str1prev = NULL, str1 = xyStrings; + str1; + str1prev = str1, str1 = str1->next) { + if (str1->marked) { + 
continue; + } + h = str1->yMax - str1->yMin; + if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) { + continue; + } + x = str1->xMin; + yMin = str1->yMin; + yMax = str1->yMax; + for (str2prev = str1, str2 = str1->next; + str2; + str2prev = str2, str2 = str2->next) { + h = str2->yMax - str2->yMin; + if (!str2->marked && + (str2->xMin - str2prev->xMax) / h > textOutColSpace && + fabs(str2->xMin - x) < 0.5 && + str2->yMin - yMax < 0.3 * h && + yMin - str2->yMax < 0.3 * h) { + break; + } + } + if (str2) { + if (str2->yMin < yMin) { + yMin = str2->yMin; + } + if (str2->yMax > yMax) { + yMax = str2->yMax; + } + str2->marked = gTrue; + for (str3prev = str1, str3 = str1->next; + str3; + str3prev = str3, str3 = str3->next) { + h = str3->yMax - str3->yMin; + if (!str3->marked && + (str3->xMin - str3prev->xMax) / h > textOutColSpace && + fabs(str3->xMin - x) < 0.5 && + str3->yMin - yMax < 0.3 * h && + yMin - str3->yMax < 0.3 * h) { + break; + } + } + if (str3) { + if (str3->yMin < yMin) { + yMin = str3->yMin; + } + if (str3->yMax > yMax) { + yMax = str3->yMax; + } + str3->marked = gTrue; + do { + for (str2prev = str1, str2 = str1->next; + str2; + str2prev = str2, str2 = str2->next) { + h = str2->yMax - str2->yMin; + if (!str2->marked && + (str2->xMin - str2prev->xMax) / h > textOutColSpace && + fabs(str2->xMin - x) < 0.5 && + str2->yMin - yMax < 0.3 * h && + yMin - str2->yMax < 0.3 * h) { + if (str2->yMin < yMin) { + yMin = str2->yMin; + } + if (str2->yMax > yMax) { + yMax = str2->yMax; + } + str2->marked = gTrue; + break; + } + } + } while (str2); + if (edgesLength == edgesSize) { + edgesSize = edgesSize ? 
2 * edgesSize : 16; + edges = (TextOutColumnEdge *) + grealloc(edges, edgesSize * sizeof(TextOutColumnEdge)); + } + edges[edgesLength].x = x; + edges[edgesLength].y0 = yMin; + edges[edgesLength].y1 = yMax; + ++edgesLength; + } else { + str2->marked = gFalse; + } + } + str1->marked = gTrue; + } + } + +#if 0 //~ for debugging + printf("column edges:\n"); + for (i = 0; i < edgesLength; ++i) { + printf("%d: x=%.2f y0=%.2f y1=%.2f\n", + i, edges[i].x, edges[i].y0, edges[i].y1); + } + printf("\n------------------------------------------------------------\n\n"); +#endif + + // build the blocks + yxBlocks = NULL; + blk1 = blk2 = NULL; + while (xyStrings) { + + // build the block + str0 = xyStrings; + xyStrings = xyStrings->next; + str0->next = NULL; + blk = new TextBlock(); + blk->strings = str0; + blk->xMin = str0->xMin; + blk->xMax = str0->xMax; + blk->yMin = str0->yMin; + blk->yMax = str0->yMax; + while (xyStrings) { + str1 = NULL; + str2 = xyStrings; + fit1 = coalesceFit(str0, str2); + if (!rawOrder) { + // look for best-fitting string + space = str0->yMax - str0->yMin; + for (str3 = xyStrings, str4 = xyStrings->next; + str4 && str4->xMin - str0->xMax <= space; + str3 = str4, str4 = str4->next) { + fit2 = coalesceFit(str0, str4); + if (fit2 < fit1) { + str1 = str3; + str2 = str4; + fit1 = fit2; + } + } + } + if (fit1 > 1) { + // no fit - we're done with this block + break; + } + + // if we've hit a column edge we're done with this block + if (fit1 > 0.2) { + for (i = 0; i < edgesLength; ++i) { + if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin && + str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 && + str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) { + break; + } + } + if (i < edgesLength) { + break; + } + } + + if (str1) { + str1->next = str2->next; + } else { + xyStrings = str2->next; + } + str0->next = str2; + str2->next = NULL; + if (str2->xMax > blk->xMax) { + blk->xMax = str2->xMax; + } + if (str2->yMin < blk->yMin) { + blk->yMin = 
str2->yMin; + } + if (str2->yMax > blk->yMax) { + blk->yMax = str2->yMax; + } + str0 = str2; + } + + // insert block on list + if (!rawOrder) { + // insert block on list in yx order + for (blk1 = NULL, blk2 = yxBlocks; + blk2 && !yxBefore(blk, blk2); + blk1 = blk2, blk2 = blk2->next) ; + } + blk->next = blk2; + if (blk1) { + blk1->next = blk; + } else { + yxBlocks = blk; + } + blk1 = blk; + } + + gfree(edges); + + // the strings are now owned by the lines/blocks tree + xyStrings = NULL; + + // build the block text + uMap = globalParams->getTextEncoding(); + isUnicode = uMap ? uMap->isUnicode() : gFalse; + for (blk = yxBlocks; blk; blk = blk->next) { + blk->len = 0; + for (str1 = blk->strings; str1; str1 = str1->next) { + blk->len += str1->len; + if (str1->next && str1->next->xMin - str1->xMax > + textOutSpace * (str1->yMax - str1->yMin)) { + str1->spaceAfter = gTrue; + ++blk->len; + } else { + str1->spaceAfter = gFalse; + } + } + blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode)); + blk->xRight = (double *)gmalloc(blk->len * sizeof(double)); + blk->col = (int *)gmalloc(blk->len * sizeof(int)); + i = 0; + for (str1 = blk->strings; str1; str1 = str1->next) { + for (j = 0; j < str1->len; ++j) { + blk->text[i] = str1->text[j]; + blk->xRight[i] = str1->xRight[j]; + ++i; + } + if (str1->spaceAfter) { + blk->text[i] = (Unicode)0x0020; + blk->xRight[i] = str1->next->xMin; + ++i; + } + } + blk->convertedLen = 0; + for (j = 0; j < blk->len; ++j) { + blk->col[j] = blk->convertedLen; + if (isUnicode) { + ++blk->convertedLen; + } else if (uMap) { + blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf)); + } + } + } + if (uMap) { + uMap->decRefCnt(); + } + +#if 0 //~ for debugging + for (blk = yxBlocks; blk; blk = blk->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n", + blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len); + TextString *str; + for (str = blk->strings; str; str = str->next) { + printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'", 
+ str->xMin, str->xMax, str->yMin, str->yMax, + (str->yMax - str->yMin)); + for (i = 0; i < str->len; ++i) { + fputc(str->text[i] & 0xff, stdout); + } + if (str->spaceAfter) { + fputc(' ', stdout); + } + printf("'\n"); + } + } + printf("\n------------------------------------------------------------\n\n"); +#endif + + // build the lines + lines = NULL; + line0 = NULL; + while (yxBlocks) { + blk0 = yxBlocks; + yxBlocks = yxBlocks->next; + blk0->next = NULL; + line = new TextLine(); + line->blocks = blk0; + line->yMin = blk0->yMin; + line->yMax = blk0->yMax; + while (yxBlocks) { + + // remove duplicated text (fake boldface, shadowed text) + h = blk0->yMax - blk0->yMin; + if (yxBlocks->len == blk0->len && + !memcmp(yxBlocks->text, blk0->text, + yxBlocks->len * sizeof(Unicode)) && + fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 && + fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 && + fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 && + fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) { + blk1 = yxBlocks; + yxBlocks = yxBlocks->next; + delete blk1; + continue; + } + + if (rawOrder && yxBlocks->yMax < blk0->yMin) { + break; + } + if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax || + yxBlocks->xMin < blk0->xMax) { + break; + } + blk1 = yxBlocks; + yxBlocks = yxBlocks->next; + blk0->next = blk1; + blk1->next = NULL; + if (blk1->yMin < line->yMin) { + line->yMin = blk1->yMin; + } + if (blk1->yMax > line->yMax) { + line->yMax = blk1->yMax; + } + blk0 = blk1; + } + if (line0) { + line0->next = line; } else { - str1 = str2; + lines = line; + } + line->next = NULL; + line0 = line; + } + + + // sort the blocks into xy order + xyBlocks = NULL; + for (line = lines; line; line = line->next) { + for (blk = line->blocks; blk; blk = blk->next) { + for (blk1 = NULL, blk2 = xyBlocks; + blk2 && !xyBefore(blk, blk2); + blk1 = blk2, blk2 = blk2->xyNext) ; + blk->xyNext = blk2; + if (blk1) { + blk1->xyNext = blk; + } else { + xyBlocks = blk; + } + } + } + +#if 0 //~ for debugging + for (blk = 
xyBlocks; blk; blk = blk->xyNext) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n", + blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len); + TextString *str; + for (str = blk->strings; str; str = str->next) { + printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", + str->xMin, str->xMax, str->yMin, str->yMax, + (str->yMax - str->yMin)); + for (i = 0; i < str->len; ++i) { + fputc(str->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n------------------------------------------------------------\n\n"); +#endif + + // do column assignment + for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) { + col1 = 0; + for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) { + if (blk1->xMin >= blk2->xMax) { + d = (int)((blk1->xMin - blk2->xMax) / + (0.4 * (blk1->yMax - blk1->yMin))); + if (d > 4) { + d = 4; + } + col2 = blk2->col[0] + blk2->convertedLen + d; + if (col2 > col1) { + col1 = col2; + } + } else if (blk1->xMin > blk2->xMin) { + for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ; + col2 = blk2->col[i]; + if (col2 > col1) { + col1 = col2; + } + } + } + for (j = 0; j < blk1->len; ++j) { + blk1->col[j] += col1; } } + +#if 0 //~ for debugging + for (line = lines; line; line = line->next) { + printf("[line]\n"); + for (blk = line->blocks; blk; blk = blk->next) { + printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len); + TextString *str; + for (str = blk->strings; str; str = str->next) { + printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", + str->xMin, str->xMax, str->yMin, str->yMax, + (str->yMax - str->yMin)); + for (i = 0; i < str->len; ++i) { + fputc(str->text[i] & 0xff, stdout); + } + if (str->spaceAfter) { + printf(" [space]\n"); + } + printf("'\n"); + } + } + } + printf("\n------------------------------------------------------------\n\n"); +#endif } + GBool TextPage::findText(Unicode *s, int len, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { - TextString *str; + TextLine *line; + TextBlock *blk; 
Unicode *p; Unicode u1, u2; int m, i, j; - double x; + double x0, x1, x; - // scan all strings on page - for (str = yxStrings; str; str = str->yxNext) { - - // check: above top limit? - if (!top && (str->yMax < *yMin || - (str->yMin < *yMin && str->xMax <= *xMin))) { - continue; - } - - // check: below bottom limit? - if (!bottom && (str->yMin > *yMax || - (str->yMax > *yMax && str->xMin >= *xMax))) { - return gFalse; - } - - // search each position in this string - m = str->len; - for (i = 0, p = str->text; i <= m - len; ++i, ++p) { + // scan all blocks on page + for (line = lines; line; line = line->next) { + for (blk = line->blocks; blk; blk = blk->next) { // check: above top limit? - if (!top && str->yMin < *yMin) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; - if (x < *xMin) { - continue; - } + if (!top && (blk->yMax < *yMin || + (blk->yMin < *yMin && blk->xMax <= *xMin))) { + continue; } // check: below bottom limit? - if (!bottom && str->yMax > *yMax) { - x = (((i == 0) ? str->xMin : str->xRight[i-1]) + str->xRight[i]) / 2; - if (x > *xMax) { - return gFalse; - } + if (!bottom && (blk->yMin > *yMax || + (blk->yMax > *yMax && blk->xMin >= *xMax))) { + return gFalse; } - // compare the strings - for (j = 0; j < len; ++j) { -#if 1 //~ this lowercases Latin A-Z only -- this will eventually be - //~ extended to handle other character sets - if (p[j] >= 0x41 && p[j] <= 0x5a) { - u1 = p[j] + 0x20; - } else { - u1 = p[j]; + // search each position in this block + m = blk->len; + for (i = 0, p = blk->text; i <= m - len; ++i, ++p) { + + x0 = (i == 0) ? blk->xMin : blk->xRight[i-1]; + x1 = blk->xRight[i]; + x = 0.5 * (x0 + x1); + + // check: above top limit? + if (!top && blk->yMin < *yMin) { + if (x < *xMin) { + continue; + } } - if (s[j] >= 0x41 && s[j] <= 0x5a) { - u2 = s[j] + 0x20; - } else { - u2 = s[j]; + + // check: below bottom limit? 
+ if (!bottom && blk->yMax > *yMax) { + if (x > *xMax) { + return gFalse; + } } + + // compare the strings + for (j = 0; j < len; ++j) { +#if 1 //~ this lowercases Latin A-Z only -- this will eventually be + //~ extended to handle other character sets + if (p[j] >= 0x41 && p[j] <= 0x5a) { + u1 = p[j] + 0x20; + } else { + u1 = p[j]; + } + if (s[j] >= 0x41 && s[j] <= 0x5a) { + u2 = s[j] + 0x20; + } else { + u2 = s[j]; + } #endif - if (u1 != u2) { - break; + if (u1 != u2) { + break; + } } - } - // found it - if (j == len) { - *xMin = (i == 0) ? str->xMin : str->xRight[i-1]; - *xMax = str->xRight[i + len - 1]; - *yMin = str->yMin; - *yMax = str->yMax; - return gTrue; + // found it + if (j == len) { + *xMin = x0; + *xMax = blk->xRight[i + len - 1]; + *yMin = blk->yMin; + *yMax = blk->yMax; + return gTrue; + } } } } + return gFalse; } @@ -367,18 +867,22 @@ GString *TextPage::getText(double xMin, double yMin, double xMax, double yMax) { GString *s; UnicodeMap *uMap; + GBool isUnicode; char space[8], eol[16], buf[8]; - int spaceLen, eolLen, n; - TextString *str1; - double x0, x1, x2, y; - double xPrev, yPrev; - int i1, i2, i; + int spaceLen, eolLen, len; + TextLine *line; + TextBlock *blk; + double x0, x1, y; + int firstCol, col, i; GBool multiLine; s = new GString(); + + // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return s; } + isUnicode = uMap->isUnicode(); spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); eolLen = 0; // make gcc happy switch (globalParams->getTextEOL()) { @@ -393,61 +897,126 @@ GString *TextPage::getText(double xMin, double yMin, eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); break; } - xPrev = yPrev = 0; + + // find the leftmost column multiLine = gFalse; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - y = 0.5 * (str1->yMin + str1->yMax); - if (y > yMax) { + firstCol = -1; + for (line = lines; line; line = line->next) { + if (line->yMin > yMax) { break; } - if (y > yMin && str1->xMin < xMax && 
str1->xMax > xMin) { - x0 = x1 = x2 = str1->xMin; - for (i1 = 0; i1 < str1->len; ++i1) { - x0 = (i1==0) ? str1->xMin : str1->xRight[i1-1]; - x1 = str1->xRight[i1]; - if (0.5 * (x0 + x1) >= xMin) { - break; - } + if (line->yMax < yMin) { + continue; + } + + for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ; + if (!blk || blk->xMin > xMax) { + continue; + } + + y = 0.5 * (blk->yMin + blk->yMax); + if (y < yMin || y > yMax) { + continue; + } + + if (firstCol >= 0) { + multiLine = gTrue; + } + + i = 0; + while (1) { + x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; + x1 = blk->xRight[i]; + if (0.5 * (x0 + x1) > xMin) { + break; } - for (i2 = str1->len - 1; i2 > i1; --i2) { - x1 = (i2==0) ? str1->xMin : str1->xRight[i2-1]; - x2 = str1->xRight[i2]; - if (0.5 * (x1 + x2) <= xMax) { - break; - } + ++i; + } + col = blk->col[i]; + + if (firstCol < 0 || col < firstCol) { + firstCol = col; + } + } + + // extract the text + for (line = lines; line; line = line->next) { + if (line->yMin > yMax) { + break; + } + if (line->yMax < yMin) { + continue; + } + + for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ; + if (!blk || blk->xMin > xMax) { + continue; + } + + y = 0.5 * (blk->yMin + blk->yMax); + if (y < yMin || y > yMax) { + continue; + } + + i = 0; + while (1) { + x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; + x1 = blk->xRight[i]; + if (0.5 * (x0 + x1) > xMin) { + break; } - if (s->getLength() > 0) { - if (x0 < xPrev || str1->yMin > yPrev) { - s->append(eol, eolLen); - multiLine = gTrue; - } else { - for (i = 0; i < 4; ++i) { - s->append(space, spaceLen); - } + ++i; + } + + col = firstCol; + + do { + + // line this block up with the correct column + for (; col < blk->col[i]; ++col) { + s->append(space, spaceLen); + } + + // print the block + for (; i < blk->len; ++i) { + + x0 = (i==0) ? 
blk->xMin : blk->xRight[i-1]; + x1 = blk->xRight[i]; + if (0.5 * (x0 + x1) > xMax) { + break; } + + len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf)); + s->append(buf, len); + col += isUnicode ? 1 : len; } - for (i = i1; i <= i2; ++i) { - n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf)); - s->append(buf, n); + if (i < blk->len) { + break; } - xPrev = x2; - yPrev = str1->yMax; + + // next block + blk = blk->next; + i = 0; + + } while (blk && blk->xMin < xMax); + + if (multiLine) { + s->append(eol, eolLen); } } - if (multiLine) { - s->append(eol, eolLen); - } + uMap->decRefCnt(); + return s; } void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { UnicodeMap *uMap; char space[8], eol[16], eop[8], buf[8]; - int spaceLen, eolLen, eopLen, n; - TextString *str1, *str2, *str3; - double yMin, yMax; - int col1, col2, d, i; + int spaceLen, eolLen, eopLen, len; + TextLine *line; + TextBlock *blk; + int col, d, i; // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { @@ -469,129 +1038,46 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { } eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); - // build x-major list - xyStrings = NULL; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - for (str2 = NULL, str3 = xyStrings; - str3; - str2 = str3, str3 = str3->xyNext) { - if (str1->xMin < str3->xMin || - (str1->xMin == str3->xMin && str1->yMin < str3->yMin)) { - break; - } - } - if (str2) { - str2->xyNext = str1; - } else { - xyStrings = str1; - } - str1->xyNext = str3; - } - - // do column assignment - for (str1 = xyStrings; str1; str1 = str1->xyNext) { - col1 = 0; - for (str2 = xyStrings; str2 != str1; str2 = str2->xyNext) { - if (str1->xMin >= str2->xMax) { - col2 = str2->col + str2->len + 4; - if (col2 > col1) { - col1 = col2; - } - } else if (str1->xMin > str2->xMin) { - col2 = str2->col + - (int)(((str1->xMin - str2->xMin) / (str2->xMax - str2->xMin)) * - str2->len); - if (col2 > col1) { - col1 = col2; + // 
output + for (line = lines; line; line = line->next) { + col = 0; + for (blk = line->blocks; blk; blk = blk->next) { + + // line this block up with the correct column + if (rawOrder && col == 0) { + col = blk->col[0]; + } else { + for (; col < blk->col[0]; ++col) { + (*outputFunc)(outputStream, space, spaceLen); } } - } - str1->col = col1; - } - -#if 0 //~ for debugging - fprintf((FILE *)outputStream, "~~~~~~~~~~\n"); - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - fprintf((FILE *)outputStream, "(%4d,%4d) - (%4d,%4d) [%3d] '", - (int)str1->xMin, (int)str1->yMin, - (int)str1->xMax, (int)str1->yMax, str1->col); - for (i = 0; i < str1->len; ++i) { - fputc(str1->text[i] & 0xff, stdout); - } - printf("'\n"); - } - fprintf((FILE *)outputStream, "~~~~~~~~~~\n"); -#endif - - // output - col1 = 0; - yMax = yxStrings ? yxStrings->yMax : 0; - for (str1 = yxStrings; str1; str1 = str1->yxNext) { - // line this string up with the correct column - if (rawOrder && col1 == 0) { - col1 = str1->col; - } else { - for (; col1 < str1->col; ++col1) { - (*outputFunc)(outputStream, space, spaceLen); + // print the block + for (i = 0; i < blk->len; ++i) { + len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf)); + (*outputFunc)(outputStream, buf, len); } + col += blk->convertedLen; } - // print the string - for (i = 0; i < str1->len; ++i) { - if ((n = uMap->mapUnicode(str1->text[i], buf, sizeof(buf))) > 0) { - (*outputFunc)(outputStream, buf, n); + // print a return + (*outputFunc)(outputStream, eol, eolLen); + + // print extra vertical space if necessary + if (line->next) { + d = (int)((line->next->yMin - line->yMax) / + (line->blocks->strings->yMax - lines->blocks->strings->yMin) + + 0.5); + // various things (weird font matrices) can result in bogus + // values here, so do a sanity check + if (rawOrder && d > 2) { + d = 2; + } else if (!rawOrder && d > 5) { + d = 5; } - } - - // increment column - col1 += str1->len; - - // update yMax for this line - if (str1->yMax > yMax) { - 
yMax = str1->yMax; - } - - // if we've hit the end of the line... - if (!(str1->yxNext && - !(rawOrder && str1->yxNext->yMax < str1->yMin) && - str1->yxNext->yMin < 0.2*str1->yMin + 0.8*str1->yMax && - str1->yxNext->xMin >= str1->xMax)) { - - // print a return - (*outputFunc)(outputStream, eol, eolLen); - - // print extra vertical space if necessary - if (str1->yxNext) { - - // find yMin for next line - yMin = str1->yxNext->yMin; - for (str2 = str1->yxNext; str2; str2 = str2->yxNext) { - if (str2->yMin < yMin) { - yMin = str2->yMin; - } - if (!(str2->yxNext && str2->yxNext->yMin < str2->yMax && - str2->yxNext->xMin >= str2->xMax)) - break; - } - - // print the space - d = (int)((yMin - yMax) / (str1->yMax - str1->yMin) + 0.5); - // various things (weird font matrices) can result in bogus - // values here, so do a sanity check - if (rawOrder && d > 2) { - d = 2; - } else if (!rawOrder && d > 5) { - d = 5; - } - for (; d > 0; --d) { - (*outputFunc)(outputStream, eol, eolLen); - } + for (; d > 0; --d) { + (*outputFunc)(outputStream, eol, eolLen); } - - // set up for next line - col1 = 0; - yMax = str1->yxNext ? str1->yxNext->yMax : 0; } } @@ -603,20 +1089,89 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { uMap->decRefCnt(); } +// Returns true if <str1> should be inserted before <str2> in xy +// order. +GBool TextPage::xyBefore(TextString *str1, TextString *str2) { + return str1->xMin < str2->xMin || + (str1->xMin == str2->xMin && str1->yMin < str2->yMin); +} + +// Returns true if <blk1> should be inserted before <blk2> in xy +// order. +GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) { + return blk1->xMin < blk2->xMin || + (blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin); +} + +// Returns true if <blk1> should be inserted before <blk2> in yx +// order, allowing a little slack for vertically overlapping text. 
+GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) { + double h1, h2, overlap; + + h1 = blk1->yMax - blk1->yMin; + h2 = blk2->yMax - blk2->yMin; + overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) - + (blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) / + (h1 < h2 ? h1 : h2); + if (overlap > 0.6) { + return blk1->xMin < blk2->xMin; + } + return blk1->yMin < blk2->yMin; +} + +double TextPage::coalesceFit(TextString *str1, TextString *str2) { + double h1, h2, w1, w2, r, overlap, spacing; + + h1 = str1->yMax - str1->yMin; + h2 = str2->yMax - str2->yMin; + w1 = str1->xMax - str1->xMin; + w2 = str2->xMax - str2->xMin; + r = h1 / h2; + if (r < (1.0 / 3.0) || r > 3) { + return 10; + } + overlap = ((str1->yMax < str2->yMax ? str1->yMax : str2->yMax) - + (str1->yMin > str2->yMin ? str1->yMin : str2->yMin)) / + (h1 < h2 ? h1 : h2); + if (overlap < 0.5) { + return 10; + } + spacing = (str2->xMin - str1->xMax) / (h1 > h2 ? h1 : h2); + if (spacing < -0.5) { + return 10; + } + // separate text that overlaps - duplicated text (so that fake + // boldface and shadowed text can be cleanly removed) + if ((str2->xMin - str1->xMax) / (w1 < w2 ? 
w1 : w2) < -0.7) { + return 10; + } + return spacing; +} + void TextPage::clear() { - TextString *p1, *p2; + TextLine *p1, *p2; + TextString *s1, *s2; if (curStr) { delete curStr; curStr = NULL; } - for (p1 = yxStrings; p1; p1 = p2) { - p2 = p1->yxNext; - delete p1; + if (lines) { + for (p1 = lines; p1; p1 = p2) { + p2 = p1->next; + delete p1; + } + } else if (xyStrings) { + for (s1 = xyStrings; s1; s1 = s2) { + s2 = s1->next; + delete s1; + } } - yxStrings = NULL; xyStrings = NULL; - yxCur1 = yxCur2 = NULL; + xyCur1 = xyCur2 = NULL; + lines = NULL; + nest = 0; + nTinyChars = 0; } //------------------------------------------------------------------------ @@ -691,7 +1246,7 @@ void TextOutputDev::updateFont(GfxState *state) { } void TextOutputDev::beginString(GfxState *state, GString *s) { - text->beginString(state); + text->beginString(state, state->getCurX(), state->getCurY()); } void TextOutputDev::endString(GfxState *state) { @@ -711,3 +1266,9 @@ GBool TextOutputDev::findText(Unicode *s, int len, double *xMax, double *yMax) { return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax); } + +GString *TextOutputDev::getText(double xMin, double yMin, + double xMax, double yMax) { + return text->getText(xMin, yMin, xMax, yMax); +} + |