//======================================================================== // // TextOutputDev.cc // // Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== #include #ifdef USE_GCC_PRAGMAS #pragma implementation #endif #include #include #include #include #include #ifdef WIN32 #include // for O_BINARY #include // for setmode #endif #include "gmem.h" #include "GString.h" #include "GList.h" #include "config.h" #include "Error.h" #include "GlobalParams.h" #include "UnicodeMap.h" #include "GfxState.h" #include "TextOutputDev.h" #ifdef MACOS // needed for setting type/creator of MacOS files #include "ICSupport.h" #endif //------------------------------------------------------------------------ // parameters //------------------------------------------------------------------------ // Minium and maximum inter-word spacing (as a fraction of the average // character width). #define wordMinSpaceWidth 0.3 #define wordMaxSpaceWidth 2.0 // Default min and max inter-word spacing (when the average character // width is unknown). #define wordDefMinSpaceWidth 0.2 #define wordDefMaxSpaceWidth 1.5 // Max difference in x,y coordinates (as a fraction of the font size) // allowed for duplicated text (fake boldface, drop shadows) which is // to be discarded. #define dupMaxDeltaX 0.1 #define dupMaxDeltaY 0.2 // Min overlap (as a fraction of the font size) required for two // lines to be considered vertically overlapping. #define lineOverlapSlack 0.5 // Max difference in baseline y coordinates (as a fraction of the font // size) allowed for words which are to be grouped into a line, not // including sub/superscripts. #define lineMaxBaselineDelta 0.1 // Max ratio of font sizes allowed for words which are to be grouped // into a line, not including sub/superscripts. #define lineMaxFontSizeRatio 1.4 // Min spacing (as a fraction of the font size) allowed between words // which are to be grouped into a line. #define lineMinDeltaX -0.5 // Minimum vertical overlap (as a fraction of the font size) required // for superscript and subscript words. #define lineMinSuperscriptOverlap 0.3 #define lineMinSubscriptOverlap 0.3 // Min/max ratio of font sizes allowed for sub/superscripts compared to // the base text. #define lineMinSubscriptFontSizeRatio 0.4 #define lineMaxSubscriptFontSizeRatio 1.01 #define lineMinSuperscriptFontSizeRatio 0.4 #define lineMaxSuperscriptFontSizeRatio 1.01 // Max horizontal spacing (as a fraction of the font size) allowed // before sub/superscripts. #define lineMaxSubscriptDeltaX 0.2 #define lineMaxSuperscriptDeltaX 0.2 // Maximum vertical spacing (as a fraction of the font size) allowed // for lines which are to be grouped into a block. #define blkMaxSpacing 2.0 // Max ratio of primary font sizes allowed for lines which are to be // grouped into a block. #define blkMaxFontSizeRatio 1.3 // Min overlap (as a fraction of the font size) required for two // blocks to be considered vertically overlapping. #define blkOverlapSlack 0.5 // Max vertical spacing (as a fraction of the font size) allowed // between blocks which are 'adjacent' when sorted by reading order. #define blkMaxSortSpacing 2.0 // Max vertical offset (as a fraction of the font size) of the top and // bottom edges allowed for blocks which are to be grouped into a // flow. #define flowMaxDeltaY 1.0 //------------------------------------------------------------------------ // TextFontInfo //------------------------------------------------------------------------ TextFontInfo::TextFontInfo(GfxState *state) { double *textMat; double t1, t2, avgWidth, w; int n, i; gfxFont = state->getFont(); textMat = state->getTextMat(); horizScaling = state->getHorizScaling(); if ((t1 = fabs(textMat[0])) > 0.01 && (t2 = fabs(textMat[3])) > 0.01) { horizScaling *= t1 / t2; } minSpaceWidth = horizScaling * wordDefMinSpaceWidth; maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth; if (gfxFont && gfxFont->isCIDFont()) { //~ handle 16-bit fonts } else if (gfxFont && gfxFont->getType() != fontType3) { avgWidth = 0; n = 0; for (i = 0; i < 256; ++i) { w = ((Gfx8BitFont *)gfxFont)->getWidth(i); if (w > 0) { avgWidth += w; ++n; } } if (n > 0) { avgWidth /= n; minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth; maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth; } } } TextFontInfo::~TextFontInfo() { } GBool TextFontInfo::matches(GfxState *state) { double *textMat; double t1, t2, h; textMat = state->getTextMat(); h = state->getHorizScaling(); if ((t1 = fabs(textMat[0])) > 0.01 && (t2 = fabs(textMat[3])) > 0.01) { h *= t1 / t2; } return state->getFont() == gfxFont && fabs(h - horizScaling) < 0.01; } //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA, TextFontInfo *fontA, double fontSizeA) { GfxFont *gfxFont; double x, y; charPos = charPosA; charLen = 0; font = fontA; fontSize = fontSizeA; state->transform(x0, y0, &x, &y); if ((gfxFont = font->gfxFont)) { yMin = y - gfxFont->getAscent() * fontSize; yMax = y - gfxFont->getDescent() * fontSize; } else { // this means that the PDF file draws text without a current font, // which should never happen yMin = y - 0.95 * fontSize; yMax = y + 0.35 * fontSize; } if (yMin == yMax) { // this is a sanity check for a case that shouldn't happen -- but // if it does happen, we want to avoid dividing by zero later yMin = y; yMax = y + 1; } yBase = y; text = NULL; xRight = NULL; len = size = 0; spaceAfter = gFalse; next = NULL; } TextWord::~TextWord() { gfree(text); gfree(xRight); } void TextWord::addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u) { if (len == size) { size += 16; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); xRight = (double *)grealloc(xRight, size * sizeof(double)); } text[len] = u; if (len == 0) { xMin = x; } xMax = xRight[len] = x + dx; ++len; } // Returns true if comes before in xy order. GBool TextWord::xyBefore(TextWord *word2) { return xMin < word2->xMin || (xMin == word2->xMin && yMin < word2->yMin); } // Merge another word onto the end of this one. void TextWord::merge(TextWord *word2) { int i; xMax = word2->xMax; if (word2->yMin < yMin) { yMin = word2->yMin; } if (word2->yMax > yMax) { yMax = word2->yMax; } if (len + word2->len > size) { size = len + word2->len; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); xRight = (double *)grealloc(xRight, size * sizeof(double)); } for (i = 0; i < word2->len; ++i) { text[len + i] = word2->text[i]; xRight[len + i] = word2->xRight[i]; } len += word2->len; charLen += word2->charLen; } //------------------------------------------------------------------------ // TextLine //------------------------------------------------------------------------ TextLine::TextLine() { words = NULL; text = NULL; xRight = NULL; col = NULL; len = 0; hyphenated = gFalse; pageNext = NULL; next = NULL; flowNext = NULL; } TextLine::~TextLine() { TextWord *w1, *w2; for (w1 = words; w1; w1 = w2) { w2 = w1->next; delete w1; } gfree(text); gfree(xRight); gfree(col); } // Returns true if comes before in yx order, allowing // slack for vertically overlapping lines. GBool TextLine::yxBefore(TextLine *line2) { double dy; dy = lineOverlapSlack * fontSize; // non-overlapping case if (line2->yMin > yMax - dy || line2->yMax < yMin + dy) { return yMin < line2->yMin || (yMin == line2->yMin && xMin < line2->xMin); } // overlapping case return xMin < line2->xMin; } // Merge another line's words onto the end of this line. void TextLine::merge(TextLine *line2) { int newLen, i; xMax = line2->xMax; if (line2->yMin < yMin) { yMin = line2->yMin; } if (line2->yMax > yMax) { yMax = line2->yMax; } xSpaceR = line2->xSpaceR; lastWord->spaceAfter = gTrue; lastWord->next = line2->words; lastWord = line2->lastWord; line2->words = NULL; newLen = len + 1 + line2->len; text = (Unicode *)grealloc(text, newLen * sizeof(Unicode)); xRight = (double *)grealloc(xRight, newLen * sizeof(double)); text[len] = (Unicode)0x0020; xRight[len] = line2->xMin; for (i = 0; i < line2->len; ++i) { text[len + 1 + i] = line2->text[i]; xRight[len + 1 + i] = line2->xRight[i]; } len = newLen; convertedLen += line2->convertedLen; hyphenated = line2->hyphenated; } //------------------------------------------------------------------------ // TextBlock //------------------------------------------------------------------------ TextBlock::TextBlock() { lines = NULL; next = NULL; } TextBlock::~TextBlock() { TextLine *l1, *l2; for (l1 = lines; l1; l1 = l2) { l2 = l1->next; delete l1; } } // Returns true if comes before in xy order, allowing // slack for vertically overlapping blocks. GBool TextBlock::yxBefore(TextBlock *blk2) { double dy; dy = blkOverlapSlack * lines->fontSize; // non-overlapping case if (blk2->yMin > yMax - dy || blk2->yMax < yMin + dy) { return yMin < blk2->yMin || (yMin == blk2->yMin && xMin < blk2->xMin); } // overlapping case return xMin < blk2->xMin; } // Merge another block's line onto the right of this one. void TextBlock::mergeRight(TextBlock *blk2) { lines->merge(blk2->lines); xMax = lines->xMax; yMin = lines->yMin; yMax = lines->yMax; xSpaceR = lines->xSpaceR; } // Merge another block's lines onto the bottom of this block. void TextBlock::mergeBelow(TextBlock *blk2) { TextLine *line; if (blk2->xMin < xMin) { xMin = blk2->xMin; } if (blk2->xMax > xMax) { xMax = blk2->xMax; } yMax = blk2->yMax; if (blk2->xSpaceL > xSpaceL) { xSpaceL = blk2->xSpaceL; } if (blk2->xSpaceR < xSpaceR) { xSpaceR = blk2->xSpaceR; } if (blk2->maxFontSize > maxFontSize) { maxFontSize = blk2->maxFontSize; } for (line = lines; line->next; line = line->next) ; line->next = line->flowNext = blk2->lines; blk2->lines = NULL; } //------------------------------------------------------------------------ // TextFlow //------------------------------------------------------------------------ TextFlow::TextFlow() { blocks = NULL; next = NULL; } TextFlow::~TextFlow() { TextBlock *b1, *b2; for (b1 = blocks; b1; b1 = b2) { b2 = b1->next; delete b1; } } //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ TextPage::TextPage(GBool rawOrderA) { rawOrder = rawOrderA; curWord = NULL; charPos = 0; font = NULL; fontSize = 0; nest = 0; nTinyChars = 0; words = wordPtr = NULL; lines = NULL; flows = NULL; fonts = new GList(); } TextPage::~TextPage() { clear(); delete fonts; } void TextPage::updateFont(GfxState *state) { GfxFont *gfxFont; double *fm; char *name; int code, mCode, letterCode, anyCode; double w; int i; // get the font info object font = NULL; for (i = 0; i < fonts->getLength(); ++i) { font = (TextFontInfo *)fonts->get(i); if (font->matches(state)) { break; } font = NULL; } if (!font) { font = new TextFontInfo(state); fonts->append(font); } // adjust the font size gfxFont = state->getFont(); fontSize = state->getTransformedFontSize(); if (gfxFont && gfxFont->getType() == fontType3) { // This is a hack which makes it possible to deal with some Type 3 // fonts. The problem is that it's impossible to know what the // base coordinate system used in the font is without actually // rendering the font. This code tries to guess by looking at the // width of the character 'm' (which breaks if the font is a // subset that doesn't contain 'm'). mCode = letterCode = anyCode = -1; for (code = 0; code < 256; ++code) { name = ((Gfx8BitFont *)gfxFont)->getCharName(code); if (name && name[0] == 'm' && name[1] == '\0') { mCode = code; } if (letterCode < 0 && name && name[1] == '\0' && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) { letterCode = code; } if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { anyCode = code; } } if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { // 0.6 is a generic average 'm' width -- yes, this is a hack fontSize *= w / 0.6; } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { // even more of a hack: 0.5 is a generic letter width fontSize *= w / 0.5; } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { // better than nothing: 0.5 is a generic character width fontSize *= w / 0.5; } fm = gfxFont->getFontMatrix(); if (fm[0] != 0) { fontSize *= fabs(fm[3] / fm[0]); } } } void TextPage::beginWord(GfxState *state, double x0, double y0) { // This check is needed because Type 3 characters can contain // text-drawing operations (when TextPage is being used via // XOutputDev rather than TextOutputDev). if (curWord) { ++nest; return; } curWord = new TextWord(state, x0, y0, charPos, font, fontSize); } void TextPage::addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, Unicode *u, int uLen) { double x1, y1, w1, h1, dx2, dy2, sp; int n, i; // if the previous char was a space, addChar will have called // endWord, so we need to start a new word if (!curWord) { beginWord(state, x, y); } // throw away chars that aren't inside the page bounds state->transform(x, y, &x1, &y1); if (x1 < 0 || x1 > pageWidth || y1 < 0 || y1 > pageHeight) { return; } // subtract char and word spacing from the dx,dy values sp = state->getCharSpace(); if (c == (CharCode)0x20) { sp += state->getWordSpace(); } state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); dx -= dx2; dy -= dy2; state->transformDelta(dx, dy, &w1, &h1); // check the tiny chars limit if (!globalParams->getTextKeepTinyChars() && fabs(w1) < 3 && fabs(h1) < 3) { if (++nTinyChars > 20000) { return; } } // break words at space character if (uLen == 1 && u[0] == (Unicode)0x20) { ++curWord->charLen; ++charPos; endWord(); return; } // large char spacing is sometimes used to move text around -- in // this case, break text into individual chars and let the coalesce // function deal with it later n = curWord->len; if (n > 0 && x1 - curWord->xRight[n-1] > curWord->font->minSpaceWidth * curWord->fontSize) { endWord(); beginWord(state, x, y); } // page rotation and/or transform matrices can cause text to be // drawn in reverse order -- in this case, swap the begin/end // coordinates and break text into individual chars if (w1 < 0) { endWord(); beginWord(state, x + dx, y + dy); x1 += w1; y1 += h1; w1 = -w1; h1 = -h1; } // add the characters to the current word if (uLen != 0) { w1 /= uLen; h1 /= uLen; } for (i = 0; i < uLen; ++i) { curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); } ++curWord->charLen; ++charPos; } void TextPage::endWord() { // This check is needed because Type 3 characters can contain // text-drawing operations (when TextPage is being used via // XOutputDev rather than TextOutputDev). if (nest > 0) { --nest; return; } if (curWord) { addWord(curWord); curWord = NULL; } } void TextPage::addWord(TextWord *word) { TextWord *p1, *p2; // throw away zero-length words -- they don't have valid xMin/xMax // values, and they're useless anyway if (word->len == 0) { delete word; return; } // insert word in xy list if (rawOrder) { p1 = wordPtr; p2 = NULL; } else { if (wordPtr && wordPtr->xyBefore(word)) { p1 = wordPtr; p2 = wordPtr->next; } else { p1 = NULL; p2 = words; } for (; p2; p1 = p2, p2 = p2->next) { if (word->xyBefore(p2)) { break; } } } if (p1) { p1->next = word; } else { words = word; } word->next = p2; wordPtr = word; } void TextPage::coalesce(GBool physLayout) { TextWord *word0, *word1, *word2; TextLine *line0, *line1, *line2, *line3, *line4, *lineList; TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6; TextBlock *yxBlocks, *blocks, *blkStack; TextFlow *flow0, *flow1; double sz, xLimit, yLimit; double fit1, fit2, sp1, sp2 = 0.0e+0; GBool found; UnicodeMap *uMap; GBool isUnicode; char buf[8]; int col1, col2, d, i, j; #if 0 // for debugging printf("*** initial word list ***\n"); for (word0 = words; word0; word0 = word0->next) { printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } printf("\n"); fflush(stdout); #endif //----- discard duplicated text (fake boldface, drop shadows) word0 = words; while (word0) { sz = word0->fontSize; xLimit = word0->xMin + sz * dupMaxDeltaX; found = gFalse; for (word1 = word0, word2 = word0->next; word2 && word2->xMin < xLimit; word1 = word2, word2 = word2->next) { if (word2->len == word0->len && !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) && fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY && fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY && fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) { found = gTrue; break; } } if (found) { word1->next = word2->next; delete word2; } else { word0 = word0->next; } } #if 0 // for debugging printf("*** words after removing duplicate text ***\n"); for (word0 = words; word0; word0 = word0->next) { printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } printf("\n"); fflush(stdout); #endif //----- merge words word0 = words; while (word0) { sz = word0->fontSize; // look for adjacent text which is part of the same word, and // merge it into this word xLimit = word0->xMax + sz * word0->font->minSpaceWidth; if (rawOrder) { word1 = word0; word2 = word0->next; found = word2 && word2->xMin < xLimit && word2->font == word0->font && fabs(word2->fontSize - sz) < 0.05 && fabs(word2->yBase - word0->yBase) < 0.05 && word2->charPos == word0->charPos + word0->charLen; } else { found = gFalse; for (word1 = word0, word2 = word0->next; word2 && word2->xMin < xLimit; word1 = word2, word2 = word2->next) { if (word2->font == word0->font && fabs(word2->fontSize - sz) < 0.05 && fabs(word2->yBase - word0->yBase) < 0.05 && word2->charPos == word0->charPos + word0->charLen) { found = gTrue; break; } } } if (found) { word0->merge(word2); word1->next = word2->next; delete word2; continue; } word0 = word0->next; } #if 0 // for debugging printf("*** after merging words ***\n"); for (word0 = words; word0; word0 = word0->next) { printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } printf("\n"); fflush(stdout); #endif //----- assemble words into lines lineList = line0 = NULL; while (words) { // remove the first word from the word list word0 = words; words = words->next; word0->next = NULL; // find the best line (if any) for the word if (rawOrder) { if (line0 && lineFit(line0, word0, &sp2) >= 0) { line1 = line0; sp1 = sp2; } else { line1 = NULL; sp1 = 0; } } else { line1 = NULL; fit1 = 0; sp1 = 0; for (line2 = lineList; line2; line2 = line2->next) { fit2 = lineFit(line2, word0, &sp2); if (fit2 >= 0 && (!line1 || fit2 < fit1)) { line1 = line2; fit1 = fit2; sp1 = sp2; } } } // found a line: append the word if (line1) { word1 = line1->lastWord; word1->next = word0; line1->lastWord = word0; if (word0->xMax > line1->xMax) { line1->xMax = word0->xMax; } if (word0->yMin < line1->yMin) { line1->yMin = word0->yMin; } if (word0->yMax > line1->yMax) { line1->yMax = word0->yMax; } line1->len += word0->len; if (sp1 > line1->fontSize * line1->font->minSpaceWidth) { word1->spaceAfter = gTrue; ++line1->len; } // didn't find a line: create a new line } else { line1 = new TextLine(); line1->words = line1->lastWord = word0; line1->xMin = word0->xMin; line1->xMax = word0->xMax; line1->yMin = word0->yMin; line1->yMax = word0->yMax; line1->yBase = word0->yBase; line1->font = word0->font; line1->fontSize = word0->fontSize; line1->len = word0->len; if (line0) { line0->next = line1; } else { lineList = line1; } line0 = line1; } } // build the line text uMap = globalParams->getTextEncoding(); isUnicode = uMap ? uMap->isUnicode() : gFalse; for (line1 = lineList; line1; line1 = line1->next) { line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode)); line1->xRight = (double *)gmalloc(line1->len * sizeof(double)); line1->col = (int *)gmalloc(line1->len * sizeof(int)); i = 0; for (word1 = line1->words; word1; word1 = word1->next) { for (j = 0; j < word1->len; ++j) { line1->text[i] = word1->text[j]; line1->xRight[i] = word1->xRight[j]; ++i; } if (word1->spaceAfter && word1->next) { line1->text[i] = (Unicode)0x0020; line1->xRight[i] = word1->next->xMin; ++i; } } line1->convertedLen = 0; for (j = 0; j < line1->len; ++j) { line1->col[j] = line1->convertedLen; if (isUnicode) { ++line1->convertedLen; } else if (uMap) { line1->convertedLen += uMap->mapUnicode(line1->text[j], buf, sizeof(buf)); } } // check for hyphen at end of line //~ need to check for other chars used as hyphens if (line1->text[line1->len - 1] == (Unicode)'-') { line1->hyphenated = gTrue; } } if (uMap) { uMap->decRefCnt(); } #if 0 // for debugging printf("*** lines in xy order ***\n"); for (line0 = lineList; line0; line0 = line0->next) { printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } printf("\n"); fflush(stdout); #endif //----- column assignment for (line1 = lineList; line1; line1 = line1->next) { col1 = 0; for (line2 = lineList; line2 != line1; line2 = line2->next) { if (line1->xMin >= line2->xMax) { d = (int)((line1->xMin - line2->xMax) / (line1->font->maxSpaceWidth * line1->fontSize)); if (d > 4) { d = 4; } col2 = line2->col[0] + line2->convertedLen + d; if (col2 > col1) { col1 = col2; } } else if (line1->xMin > line2->xMin) { for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ; col2 = line2->col[i]; if (col2 > col1) { col1 = col2; } } } for (j = 0; j < line1->len; ++j) { line1->col[j] += col1; } } #if 0 // for debugging printf("*** lines in xy order, after column assignment ***\n"); for (line0 = lineList; line0; line0 = line0->next) { printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->col[0], line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } printf("\n"); fflush(stdout); #endif //----- assemble lines into blocks if (rawOrder) { lines = lineList; for (line1 = lines; line1; line1 = line1->next) { line1->xSpaceL = 0; line1->xSpaceR = pageWidth; } } else { // sort lines into yx order lines = NULL; while (lineList) { line0 = lineList; lineList = lineList->next; for (line1 = NULL, line2 = lines; line2 && !line0->yxBefore(line2); line1 = line2, line2 = line2->next) ; if (line1) { line1->next = line0; } else { lines = line0; } line0->next = line2; } // compute whitespace to left and right of each line line0 = lines; for (line1 = lines; line1; line1 = line1->next) { // find the first vertically overlapping line for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ; // check each vertically overlapping line -- look for the nearest // on each side line1->xSpaceL = 0; line1->xSpaceR = pageWidth; for (line2 = line0; line2 && line2->yMin < line1->yMax; line2 = line2->next) { if (line2->yMax > line1->yMin) { if (line2->xMax < line1->xMin) { if (line2->xMax > line1->xSpaceL) { line1->xSpaceL = line2->xMax; } } else if (line2->xMin > line1->xMax) { if (line2->xMin < line1->xSpaceR) { line1->xSpaceR = line2->xMin; } } } } } } // (!rawOrder) #if 0 // for debugging printf("*** lines in yx order ***\n"); for (line0 = lines; line0; line0 = line0->next) { printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } printf("\n"); fflush(stdout); #endif lineList = lines; yxBlocks = NULL; blk0 = NULL; while (lineList) { // build a new block object line0 = lineList; lineList = lineList->next; line0->next = NULL; blk1 = new TextBlock(); blk1->lines = line0; blk1->xMin = line0->xMin; blk1->xMax = line0->xMax; blk1->yMin = line0->yMin; blk1->yMax = line0->yMax; blk1->xSpaceL = line0->xSpaceL; blk1->xSpaceR = line0->xSpaceR; blk1->maxFontSize = line0->fontSize; // find subsequent lines in the block while (lineList) { // look for the first horizontally overlapping line below this // one yLimit = line0->yMax + blkMaxSpacing * line0->fontSize; line3 = line4 = NULL; if (rawOrder) { if (lineList->yMin < yLimit && lineList->xMax > blk1->xMin && lineList->xMin < blk1->xMax) { line3 = NULL; line4 = lineList; } } else { for (line1 = NULL, line2 = lineList; line2 && line2->yMin < yLimit; line1 = line2, line2 = line2->next) { if (line2->xMax > blk1->xMin && line2->xMin < blk1->xMax) { line3 = line1; line4 = line2; break; } } } // if there is an overlapping line and it fits in the block, add // it to the block if (line4 && blockFit(blk1, line4)) { if (line3) { line3->next = line4->next; } else { lineList = line4->next; } line0->next = line0->flowNext = line4; line4->next = NULL; if (line4->xMin < blk1->xMin) { blk1->xMin = line4->xMin; } else if (line4->xMax > blk1->xMax) { blk1->xMax = line4->xMax; } if (line4->yMax > blk1->yMax) { blk1->yMax = line4->yMax; } if (line4->xSpaceL > blk1->xSpaceL) { blk1->xSpaceL = line4->xSpaceL; } if (line4->xSpaceR < blk1->xSpaceR) { blk1->xSpaceR = line4->xSpaceR; } if (line4->fontSize > blk1->maxFontSize) { blk1->maxFontSize = line4->fontSize; } line0 = line4; // otherwise, we're done with this block } else { break; } } // insert block on list, in yx order if (rawOrder) { blk2 = blk0; blk3 = NULL; blk0 = blk1; } else { for (blk2 = NULL, blk3 = yxBlocks; blk3 && !blk1->yxBefore(blk3); blk2 = blk3, blk3 = blk3->next) ; } blk1->next = blk3; if (blk2) { blk2->next = blk1; } else { yxBlocks = blk1; } } #if 0 // for debugging printf("*** blocks in yx order ***\n"); for (blk0 = yxBlocks; blk0; blk0 = blk0->next) { printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); for (line0 = blk0->lines; line0; line0 = line0->next) { printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } } printf("\n"); fflush(stdout); #endif //----- merge lines and blocks, sort blocks into reading order if (rawOrder) { blocks = yxBlocks; } else { blocks = NULL; blk0 = NULL; blkStack = NULL; while (yxBlocks) { // find the next two blocks: // - if the depth-first traversal stack is empty, take the first // (upper-left-most) two blocks on the yx-sorted block list // - otherwise, find the two upper-left-most blocks under the top // block on the stack if (blkStack) { blk3 = blk4 = blk5 = blk6 = NULL; for (blk1 = NULL, blk2 = yxBlocks; blk2; blk1 = blk2, blk2 = blk2->next) { if (blk2->yMin > blkStack->yMin && blk2->xMax > blkStack->xMin && blk2->xMin < blkStack->xMax) { if (!blk4 || blk2->yxBefore(blk4)) { blk5 = blk3; blk6 = blk4; blk3 = blk1; blk4 = blk2; } else if (!blk6 || blk2->yxBefore(blk6)) { blk5 = blk1; blk6 = blk2; } } } } else { blk3 = NULL; blk4 = yxBlocks; blk5 = yxBlocks; blk6 = yxBlocks->next; } // merge case 1: // | | | // | blkStack | | blkStack // +---------------------+ --> +-------------- // +------+ +------+ +-----------+ // | blk4 | | blk6 | ... | blk4+blk6 | // +------+ +------+ +-----------+ yLimit = 0; // make gcc happy if (blkStack) { yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize; } if (blkStack && blk4 && blk6 && !blk4->lines->next && !blk6->lines->next && lineFit2(blk4->lines, blk6->lines) && blk4->yMin < yLimit && blk4->xMin > blkStack->xSpaceL && blkStack->xMin > blk4->xSpaceL && blk6->xMax < blkStack->xSpaceR) { blk4->mergeRight(blk6); if (blk5) { blk5->next = blk6->next; } else { yxBlocks = blk6->next; } delete blk6; // merge case 2: // | | | | // | blkStack | | | // +---------------------+ --> | blkStack+blk2 | // +---------------------+ | | // | blk4 | | | // | | | | } else if (blkStack && blk4 && blk4->yMin < yLimit && blockFit2(blkStack, blk4)) { blkStack->mergeBelow(blk4); if (blk3) { blk3->next = blk4->next; } else { yxBlocks = blk4->next; } delete blk4; // if any of: // 1. no block found // 2. non-fully overlapping block found // 3. large vertical gap above the overlapping block // then pop the stack and try again } else if (!blk4 || (blkStack && (blk4->xMin < blkStack->xSpaceL || blk4->xMax > blkStack->xSpaceR || blk4->yMin - blkStack->yMax > blkMaxSortSpacing * blkStack->maxFontSize))) { blkStack = blkStack->stackNext; // add a block to the sorted list } else { // remove the block from the yx-sorted list if (blk3) { blk3->next = blk4->next; } else { yxBlocks = blk4->next; } blk4->next = NULL; // append the block to the reading-order list if (blk0) { blk0->next = blk4; } else { blocks = blk4; } blk0 = blk4; // push the block on the traversal stack if (!physLayout) { blk4->stackNext = blkStack; blkStack = blk4; } } } } // (!rawOrder) #if 0 // for debugging printf("*** blocks in reading order (after merging) ***\n"); for (blk0 = blocks; blk0; blk0 = blk0->next) { printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); for (line0 = blk0->lines; line0; line0 = line0->next) { printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } } printf("\n"); fflush(stdout); #endif //----- assemble blocks into flows if (rawOrder) { // one flow per block flow0 = NULL; while (blocks) { flow1 = new TextFlow(); flow1->blocks = blocks; flow1->lines = blocks->lines; flow1->yMin = blocks->yMin; flow1->yMax = blocks->yMax; blocks = blocks->next; flow1->blocks->next = NULL; if (flow0) { flow0->next = flow1; } else { flows = flow1; } flow0 = flow1; } } else { // compute whitespace above and below each block for (blk0 = blocks; blk0; blk0 = blk0->next) { blk0->ySpaceT = 0; blk0->ySpaceB = pageHeight; // check each horizontally overlapping block for (blk1 = blocks; blk1; blk1 = blk1->next) { if (blk1 != blk0 && blk1->xMin < blk0->xMax && blk1->xMax > blk0->xMin) { if (blk1->yMax < blk0->yMin) { if (blk1->yMax > blk0->ySpaceT) { blk0->ySpaceT = blk1->yMax; } } else if (blk1->yMin > blk0->yMax) { if (blk1->yMin < blk0->ySpaceB) { blk0->ySpaceB = blk1->yMin; } } } } } flow0 = NULL; while (blocks) { // build a new flow object flow1 = new TextFlow(); flow1->blocks = blocks; flow1->lines = blocks->lines; flow1->yMin = blocks->yMin; flow1->yMax = blocks->yMax; flow1->ySpaceT = blocks->ySpaceT; flow1->ySpaceB = blocks->ySpaceB; // find subsequent blocks in the flow for (blk1 = blocks, blk2 = blocks->next; blk2 && flowFit(flow1, blk2); blk1 = blk2, blk2 = blk2->next) { if (blk2->yMin < flow1->yMin) { flow1->yMin = blk2->yMin; } if (blk2->yMax > flow1->yMax) { flow1->yMax = blk2->yMax; } if (blk2->ySpaceT > flow1->ySpaceT) { flow1->ySpaceT = blk2->ySpaceT; } if (blk2->ySpaceB < flow1->ySpaceB) { flow1->ySpaceB = blk2->ySpaceB; } for (line1 = blk1->lines; line1->next; line1 = line1->next) ; line1->flowNext = blk2->lines; } // chop the block list blocks = blk1->next; blk1->next = NULL; // append the flow to the list if (flow0) { flow0->next = flow1; } else { flows = flow1; } flow0 = flow1; } } #if 0 // for debugging printf("*** flows ***\n"); for (flow0 = flows; flow0; flow0 = flow0->next) { printf("[flow]\n"); for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) { printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n", blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax, blk0->ySpaceT, blk0->ySpaceB); for (line0 = blk0->lines; line0; line0 = line0->next) { printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } } } printf("\n"); fflush(stdout); #endif //----- sort lines into yx order // (the block/line merging process doesn't maintain the full-page // linked list of lines) lines = NULL; if (rawOrder) { line0 = NULL; for (flow0 = flows; flow0; flow0 = flow0->next) { for (line1 = flow0->lines; line1; line1 = line1->flowNext) { if (line0) { line0->pageNext = line1; } else { lines = line1; } line0 = line1; } } } else { for (flow0 = flows; flow0; flow0 = flow0->next) { for (line0 = flow0->lines; line0; line0 = line0->flowNext) { for (line1 = NULL, line2 = lines; line2 && !line0->yxBefore(line2); line1 = line2, line2 = line2->pageNext) ; if (line1) { line1->pageNext = line0; } else { lines = line0; } line0->pageNext = line2; } } } #if 0 // for debugging printf("*** lines in yx order ***\n"); for (line0 = lines; line0; line0 = line0->pageNext) { printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n", line0->xMin, line0->xMax, line0->yMin, line0->yMax, line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0], line0->len); for (word0 = line0->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } printf("\n"); fflush(stdout); #endif } // If can be added the end of , return the absolute value // of the difference between 's baseline and 's baseline, // and set * to the horizontal space between the current last // word in and . A smaller return value indicates a // better fit. Otherwise, return a negative number. double TextPage::lineFit(TextLine *line, TextWord *word, double *space) { TextWord *lastWord; double fontSize0, fontSize1; double dx, dxLimit; lastWord = line->lastWord; fontSize0 = line->fontSize; fontSize1 = word->fontSize; dx = word->xMin - lastWord->xMax; dxLimit = fontSize0 * lastWord->font->maxSpaceWidth; // check inter-word spacing if (dx < fontSize0 * lineMinDeltaX || dx > dxLimit) { return -1; } if ( // look for adjacent words with close baselines and close font sizes (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 && fontSize0 < lineMaxFontSizeRatio * fontSize1 && fontSize1 < lineMaxFontSizeRatio * fontSize0) || // look for a superscript (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 && fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 && (word->yMax < lastWord->yMax || word->yBase < lastWord->yBase) && word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 && dx < fontSize0 * lineMaxSuperscriptDeltaX) || // look for a subscript (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 && fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 && (word->yMin > lastWord->yMin || word->yBase > lastWord->yBase) && line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 && dx < fontSize0 * lineMaxSubscriptDeltaX)) { *space = dx; return fabs(word->yBase - line->yBase); } return -1; } // Returns true if and can be merged into a single // line, ignoring max word spacing. GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) { double fontSize0, fontSize1; double dx; fontSize0 = line0->fontSize; fontSize1 = line1->fontSize; dx = line1->xMin - line0->xMax; // check inter-word spacing if (dx < fontSize0 * lineMinDeltaX) { return gFalse; } // look for close baselines and close font sizes if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 && fontSize0 < lineMaxFontSizeRatio * fontSize1 && fontSize1 < lineMaxFontSizeRatio * fontSize0) { return gTrue; } return gFalse; } // Returns true if can be added to . Assumes the y // coordinates are within range. GBool TextPage::blockFit(TextBlock *blk, TextLine *line) { double fontSize0, fontSize1; // check edges if (line->xMin < blk->xSpaceL || line->xMax > blk->xSpaceR || blk->xMin < line->xSpaceL || blk->xMax > line->xSpaceR) { return gFalse; } // check font sizes fontSize0 = blk->lines->fontSize; fontSize1 = line->fontSize; if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || fontSize1 > blkMaxFontSizeRatio * fontSize0) { return gFalse; } return gTrue; } // Returns true if and can be merged into a single // block. Assumes the y coordinates are within range. GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) { double fontSize0, fontSize1; // check edges if (blk1->xMin < blk0->xSpaceL || blk1->xMax > blk0->xSpaceR || blk0->xMin < blk1->xSpaceL || blk0->xMax > blk1->xSpaceR) { return gFalse; } // check font sizes fontSize0 = blk0->lines->fontSize; fontSize1 = blk1->lines->fontSize; if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || fontSize1 > blkMaxFontSizeRatio * fontSize0) { return gFalse; } return gTrue; } // Returns true if can be added to . GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) { double dy; // check whitespace above and below if (blk->yMin < flow->ySpaceT || blk->yMax > flow->ySpaceB || flow->yMin < blk->ySpaceT || flow->yMax > blk->ySpaceB) { return gFalse; } // check that block top edge is within +/- dy of flow top edge, // and that block bottom edge is above flow bottom edge + dy dy = flowMaxDeltaY * flow->blocks->maxFontSize; return blk->yMin > flow->yMin - dy && blk->yMin < flow->yMin + dy && blk->yMax < flow->yMax + dy; } GBool TextPage::findText(Unicode *s, int len, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { TextLine *line; Unicode *p; Unicode u1, u2; int m, i, j; double x0, x1, x; // scan all text on the page for (line = lines; line; line = line->pageNext) { // check: above top limit? if (!top && (line->yMax < *yMin || (line->yMin < *yMin && line->xMax <= *xMin))) { continue; } // check: below bottom limit? if (!bottom && (line->yMin > *yMax || (line->yMax > *yMax && line->xMin >= *xMax))) { return gFalse; } // search each position in this line m = line->len; for (i = 0, p = line->text; i <= m - len; ++i, ++p) { x0 = (i == 0) ? line->xMin : line->xRight[i-1]; x1 = line->xRight[i]; x = 0.5 * (x0 + x1); // check: above top limit? if (!top && line->yMin < *yMin) { if (x < *xMin) { continue; } } // check: below bottom limit? if (!bottom && line->yMax > *yMax) { if (x > *xMax) { return gFalse; } } // compare the strings for (j = 0; j < len; ++j) { #if 1 //~ this lowercases Latin A-Z only -- this will eventually be //~ extended to handle other character sets if (p[j] >= 0x41 && p[j] <= 0x5a) { u1 = p[j] + 0x20; } else { u1 = p[j]; } if (s[j] >= 0x41 && s[j] <= 0x5a) { u2 = s[j] + 0x20; } else { u2 = s[j]; } #endif if (u1 != u2) { break; } } // found it if (j == len) { *xMin = x0; *xMax = line->xRight[i + len - 1]; *yMin = line->yMin; *yMax = line->yMax; return gTrue; } } } return gFalse; } GString *TextPage::getText(double xMin, double yMin, double xMax, double yMax) { GString *s; UnicodeMap *uMap; GBool isUnicode; char space[8], eol[16], buf[8]; int spaceLen, eolLen, len; TextLine *line, *prevLine; double x0, x1, y; int firstCol, col, i; GBool multiLine; s = new GString(); // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return s; } isUnicode = uMap->isUnicode(); spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); eolLen = 0; // make gcc happy switch (globalParams->getTextEOL()) { case eolUnix: eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); break; case eolDOS: eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); break; case eolMac: eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); break; } // find the leftmost column firstCol = -1; for (line = lines; line; line = line->pageNext) { if (line->yMin > yMax) { break; } if (line->yMax < yMin || line->xMax < xMin || line->xMin > xMax) { continue; } y = 0.5 * (line->yMin + line->yMax); if (y < yMin || y > yMax) { continue; } i = 0; while (i < line->len) { x0 = (i==0) ? line->xMin : line->xRight[i-1]; x1 = line->xRight[i]; if (0.5 * (x0 + x1) > xMin) { break; } ++i; } if (i == line->len) { continue; } col = line->col[i]; if (firstCol < 0 || col < firstCol) { firstCol = col; } } // extract the text col = firstCol; multiLine = gFalse; prevLine = NULL; for (line = lines; line; line = line->pageNext) { if (line->yMin > yMax) { break; } if (line->yMax < yMin || line->xMax < xMin || line->xMin > xMax) { continue; } y = 0.5 * (line->yMin + line->yMax); if (y < yMin || y > yMax) { continue; } i = 0; while (i < line->len) { x0 = (i==0) ? line->xMin : line->xRight[i-1]; x1 = line->xRight[i]; if (0.5 * (x0 + x1) > xMin) { break; } ++i; } if (i == line->len) { continue; } // insert a return if (line->col[i] < col || (prevLine && line->yMin > prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) { s->append(eol, eolLen); col = firstCol; multiLine = gTrue; } prevLine = line; // line this block up with the correct column for (; col < line->col[i]; ++col) { s->append(space, spaceLen); } // print the portion of the line for (; i < line->len; ++i) { x0 = (i==0) ? line->xMin : line->xRight[i-1]; x1 = line->xRight[i]; if (0.5 * (x0 + x1) > xMax) { break; } len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); s->append(buf, len); col += isUnicode ? 1 : len; } } if (multiLine) { s->append(eol, eolLen); } uMap->decRefCnt(); return s; } GBool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) { TextLine *line; TextWord *word; double x; GBool first; int i; //~ this doesn't correctly handle: //~ - ranges split across multiple lines (the highlighted region //~ is the bounding box of all the parts of the range) //~ - cases where characters don't convert one-to-one into Unicode first = gTrue; for (line = lines; line; line = line->pageNext) { for (word = line->words; word; word = word->next) { if (pos < word->charPos + word->charLen && word->charPos < pos + length) { i = pos - word->charPos; if (i < 0) { i = 0; } x = (i == 0) ? word->xMin : word->xRight[i - 1]; if (first || x < *xMin) { *xMin = x; } i = pos + length - word->charPos; if (i >= word->len) { i = word->len - 1; } x = word->xRight[i]; if (first || x > *xMax) { *xMax = x; } if (first || word->yMin < *yMin) { *yMin = word->yMin; } if (first || word->yMax > *yMax) { *yMax = word->yMax; } first = gFalse; } } } return !first; } void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, GBool physLayout) { UnicodeMap *uMap; char space[8], eol[16], eop[8], buf[8]; int spaceLen, eolLen, eopLen, len; TextFlow *flow; TextLine *line; int col, d, n, i; // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return; } spaceLen = uMap->mapUnicode(0x20, space, sizeof(space)); eolLen = 0; // make gcc happy switch (globalParams->getTextEOL()) { case eolUnix: eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol)); break; case eolDOS: eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen); break; case eolMac: eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); break; } eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); // output the page, maintaining the original physical layout if (physLayout || rawOrder) { col = 0; for (line = lines; line; line = line->pageNext) { // line this block up with the correct column if (!rawOrder) { for (; col < line->col[0]; ++col) { (*outputFunc)(outputStream, space, spaceLen); } } // print the line for (i = 0; i < line->len; ++i) { len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); (*outputFunc)(outputStream, buf, len); } col += line->convertedLen; // print one or more returns if necessary if (rawOrder || !line->pageNext || line->pageNext->col[0] < col || line->pageNext->yMin > line->yMax - lineOverlapSlack * line->fontSize) { // compute number of returns d = 1; if (line->pageNext) { d += (int)((line->pageNext->yMin - line->yMax) / line->fontSize + 0.5); } // various things (weird font matrices) can result in bogus // values here, so do a sanity check if (d < 1) { d = 1; } else if (d > 5) { d = 5; } for (; d > 0; --d) { (*outputFunc)(outputStream, eol, eolLen); } col = 0; } } // output the page, "undoing" the layout } else { for (flow = flows; flow; flow = flow->next) { for (line = flow->lines; line; line = line->flowNext) { n = line->len; if (line->flowNext && line->hyphenated) { --n; } for (i = 0; i < n; ++i) { len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); (*outputFunc)(outputStream, buf, len); } if (line->flowNext && !line->hyphenated) { (*outputFunc)(outputStream, space, spaceLen); } } (*outputFunc)(outputStream, eol, eolLen); (*outputFunc)(outputStream, eol, eolLen); } } // end of page (*outputFunc)(outputStream, eop, eopLen); (*outputFunc)(outputStream, eol, eolLen); uMap->decRefCnt(); } void TextPage::startPage(GfxState *state) { clear(); if (state) { pageWidth = state->getPageWidth(); pageHeight = state->getPageHeight(); } else { pageWidth = pageHeight = 0; } } void TextPage::clear() { TextWord *w1, *w2; TextFlow *f1, *f2; if (curWord) { delete curWord; curWord = NULL; } if (words) { for (w1 = words; w1; w1 = w2) { w2 = w1->next; delete w1; } } else if (flows) { for (f1 = flows; f1; f1 = f2) { f2 = f1->next; delete f1; } } deleteGList(fonts, TextFontInfo); curWord = NULL; charPos = 0; font = NULL; fontSize = 0; nest = 0; nTinyChars = 0; words = wordPtr = NULL; lines = NULL; flows = NULL; fonts = new GList(); } //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ static void outputToFile(void *stream, char *text, int len) { fwrite(text, 1, len, (FILE *)stream); } TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, GBool rawOrderA, GBool append) { text = NULL; physLayout = physLayoutA; rawOrder = rawOrderA; ok = gTrue; // open file needClose = gFalse; if (fileName) { if (!strcmp(fileName, "-")) { outputStream = stdout; #ifdef WIN32 // keep DOS from munging the end-of-line characters setmode(fileno(stdout), O_BINARY); #endif } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) { needClose = gTrue; } else { error(-1, "Couldn't open text file '%s'", fileName); ok = gFalse; return; } outputFunc = &outputToFile; } else { outputStream = NULL; } // set up text object text = new TextPage(rawOrderA); } TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, GBool physLayoutA, GBool rawOrderA) { outputFunc = func; outputStream = stream; needClose = gFalse; physLayout = physLayoutA; rawOrder = rawOrderA; text = new TextPage(rawOrderA); ok = gTrue; } TextOutputDev::~TextOutputDev() { if (needClose) { #ifdef MACOS ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle); #endif fclose((FILE *)outputStream); } if (text) { delete text; } } void TextOutputDev::startPage(int pageNum, GfxState *state) { text->startPage(state); } void TextOutputDev::endPage() { text->coalesce(physLayout); if (outputStream) { text->dump(outputStream, outputFunc, physLayout); } } void TextOutputDev::updateFont(GfxState *state) { text->updateFont(state); } void TextOutputDev::beginString(GfxState *state, GString *s) { text->beginWord(state, state->getCurX(), state->getCurY()); } void TextOutputDev::endString(GfxState *state) { text->endWord(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, Unicode *u, int uLen) { text->addChar(state, x, y, dx, dy, c, u, uLen); } GBool TextOutputDev::findText(Unicode *s, int len, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax); } GString *TextOutputDev::getText(double xMin, double yMin, double xMax, double yMax) { return text->getText(xMin, yMin, xMax, yMax); } GBool TextOutputDev::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) { return text->findCharRange(pos, length, xMin, yMin, xMax, yMax); }