//======================================================================== // // TextOutputDev.cc // // Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== #include #ifdef USE_GCC_PRAGMAS #pragma implementation #endif #include #include #include #include #include #ifdef WIN32 #include // for O_BINARY #include // for setmode #endif #include "gmem.h" #include "GString.h" #include "GList.h" #include "config.h" #include "Error.h" #include "GlobalParams.h" #include "UnicodeMap.h" #include "UnicodeTypeTable.h" #include "GfxState.h" #include "TextOutputDev.h" #ifdef MACOS // needed for setting type/creator of MacOS files #include "ICSupport.h" #endif //------------------------------------------------------------------------ // parameters //------------------------------------------------------------------------ // Each bucket in a text pool includes baselines within a range of // this many points. #define textPoolStep 4 // Inter-character space width which will cause addChar to break up a // text string. #define defaultSpaceWidth 0.25 // Max distance between baselines of two lines within a block, as a // fraction of the font size. #define maxLineSpacingDelta 1.5 // Max difference in primary font sizes on two lines in the same // block. Delta1 is used when examining new lines above and below the // current block; delta2 is used when examining text that overlaps the // current block; delta3 is used when examining text to the left and // right of the current block. #define maxBlockFontSizeDelta1 0.05 #define maxBlockFontSizeDelta2 0.6 #define maxBlockFontSizeDelta3 0.2 // Max difference in font sizes inside a word. #define maxWordFontSizeDelta 0.05 // Maximum distance between baselines of two words on the same line, // e.g., distance between subscript or superscript and the primary // baseline, as a fraction of the font size. 
#define maxIntraLineDelta 0.5 // Minimum inter-word spacing, as a fraction of the font size. (Only // used for raw ordering.) #define minWordSpacing 0.2 // Maximum inter-word spacing, as a fraction of the font size. #define maxWordSpacing 1.5 // Minimum spacing between columns, as a fraction of the font size. #define minColSpacing 1.0 // Maximum vertical spacing between blocks within a flow, as a // multiple of the font size. #define maxBlockSpacing 2.5 // Minimum spacing between characters within a word, as a fraction of // the font size. #define minCharSpacing -0.2 // Maximum spacing between characters within a word, as a fraction of // the font size, when there is no obvious extra-wide character // spacing. #define maxCharSpacing 0.03 // When extra-wide character spacing is detected, the inter-character // space threshold is set to the minimum inter-character space // multiplied by this constant. #define maxWideCharSpacingMul 1.3 // Max difference in primary,secondary coordinates (as a fraction of // the font size) allowed for duplicated text (fake boldface, drop // shadows) which is to be discarded. #define dupMaxPriDelta 0.1 #define dupMaxSecDelta 0.2 //------------------------------------------------------------------------ // TextFontInfo //------------------------------------------------------------------------ TextFontInfo::TextFontInfo(GfxState *state) { gfxFont = state->getFont(); #if TEXTOUT_WORD_LIST fontName = (gfxFont && gfxFont->getOrigName()) ? 
gfxFont->getOrigName()->copy() : (GString *)NULL; #endif } TextFontInfo::~TextFontInfo() { #if TEXTOUT_WORD_LIST if (fontName) { delete fontName; } #endif } GBool TextFontInfo::matches(GfxState *state) { return state->getFont() == gfxFont; } //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ TextWord::TextWord(GfxState *state, int rotA, double x0, double y0, int charPosA, TextFontInfo *fontA, double fontSizeA) { GfxFont *gfxFont; double x, y, ascent, descent; rot = rotA; charPos = charPosA; charLen = 0; font = fontA; fontSize = fontSizeA; state->transform(x0, y0, &x, &y); if ((gfxFont = font->gfxFont)) { ascent = gfxFont->getAscent() * fontSize; descent = gfxFont->getDescent() * fontSize; } else { // this means that the PDF file draws text without a current font, // which should never happen ascent = 0.95 * fontSize; descent = -0.35 * fontSize; } switch (rot) { case 0: yMin = y - ascent; yMax = y - descent; if (yMin == yMax) { // this is a sanity check for a case that shouldn't happen -- but // if it does happen, we want to avoid dividing by zero later yMin = y; yMax = y + 1; } base = y; break; case 1: xMin = x + descent; xMax = x + ascent; if (xMin == xMax) { // this is a sanity check for a case that shouldn't happen -- but // if it does happen, we want to avoid dividing by zero later xMin = x; xMax = x + 1; } base = x; break; case 2: yMin = y + descent; yMax = y + ascent; if (yMin == yMax) { // this is a sanity check for a case that shouldn't happen -- but // if it does happen, we want to avoid dividing by zero later yMin = y; yMax = y + 1; } base = y; break; case 3: xMin = x - ascent; xMax = x - descent; if (xMin == xMax) { // this is a sanity check for a case that shouldn't happen -- but // if it does happen, we want to avoid dividing by zero later xMin = x; xMax = x + 1; } base = x; break; } text = NULL; edge = NULL; len = size = 0; spaceAfter = 
gFalse; next = NULL; #if TEXTOUT_WORD_LIST GfxRGB rgb; if ((state->getRender() & 3) == 1) { state->getStrokeRGB(&rgb); } else { state->getFillRGB(&rgb); } colorR = rgb.r; colorG = rgb.g; colorB = rgb.b; #endif } TextWord::~TextWord() { gfree(text); gfree(edge); } void TextWord::addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u) { if (len == size) { size += 16; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); edge = (double *)grealloc(edge, (size + 1) * sizeof(double)); } text[len] = u; switch (rot) { case 0: if (len == 0) { xMin = x; } edge[len] = x; xMax = edge[len+1] = x + dx; break; case 1: if (len == 0) { yMin = y; } edge[len] = y; yMax = edge[len+1] = y + dy; break; case 2: if (len == 0) { xMax = x; } edge[len] = x; xMin = edge[len+1] = x + dx; break; case 3: if (len == 0) { yMax = y; } edge[len] = y; yMin = edge[len+1] = y + dy; break; } ++len; } void TextWord::merge(TextWord *word) { int i; if (word->xMin < xMin) { xMin = word->xMin; } if (word->yMin < yMin) { yMin = word->yMin; } if (word->xMax > xMax) { xMax = word->xMax; } if (word->yMax > yMax) { yMax = word->yMax; } if (len + word->len > size) { size = len + word->len; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); edge = (double *)grealloc(edge, (size + 1) * sizeof(double)); } for (i = 0; i < word->len; ++i) { text[len + i] = word->text[i]; edge[len + i] = word->edge[i]; } edge[len + word->len] = word->edge[word->len]; len += word->len; charLen += word->charLen; } inline int TextWord::primaryCmp(TextWord *word) { double cmp; cmp = 0; // make gcc happy switch (rot) { case 0: cmp = xMin - word->xMin; break; case 1: cmp = yMin - word->yMin; break; case 2: cmp = word->xMax - xMax; break; case 3: cmp = word->yMax - yMax; break; } return cmp < 0 ? -1 : cmp > 0 ? 
1 : 0; } double TextWord::primaryDelta(TextWord *word) { double delta; delta = 0; // make gcc happy switch (rot) { case 0: delta = word->xMin - xMax; break; case 1: delta = word->yMin - yMax; break; case 2: delta = xMin - word->xMax; break; case 3: delta = yMin - word->yMax; break; } return delta; } int TextWord::cmpYX(const void *p1, const void *p2) { TextWord *word1 = *(TextWord **)p1; TextWord *word2 = *(TextWord **)p2; double cmp; cmp = word1->yMin - word2->yMin; if (cmp == 0) { cmp = word1->xMin - word2->xMin; } return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } #if TEXTOUT_WORD_LIST GString *TextWord::getText() { GString *s; UnicodeMap *uMap; char buf[8]; int n, i; s = new GString(); if (!(uMap = globalParams->getTextEncoding())) { return s; } for (i = 0; i < len; ++i) { n = uMap->mapUnicode(text[i], buf, sizeof(buf)); s->append(buf, n); } uMap->decRefCnt(); return s; } #endif // TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextPool //------------------------------------------------------------------------ TextPool::TextPool() { minBaseIdx = 0; maxBaseIdx = -1; pool = NULL; cursor = NULL; cursorBaseIdx = -1; } TextPool::~TextPool() { int baseIdx; TextWord *word, *word2; for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { for (word = pool[baseIdx - minBaseIdx]; word; word = word2) { word2 = word->next; delete word; } } gfree(pool); } int TextPool::getBaseIdx(double base) { int baseIdx; baseIdx = (int)(base / textPoolStep); if (baseIdx < minBaseIdx) { return minBaseIdx; } if (baseIdx > maxBaseIdx) { return maxBaseIdx; } return baseIdx; } void TextPool::addWord(TextWord *word) { TextWord **newPool; int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx; TextWord *w0, *w1; // expand the array if needed wordBaseIdx = (int)(word->base / textPoolStep); if (minBaseIdx > maxBaseIdx) { minBaseIdx = wordBaseIdx - 128; maxBaseIdx = wordBaseIdx + 128; pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) * 
sizeof(TextWord *)); for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) { pool[baseIdx - minBaseIdx] = NULL; } } else if (wordBaseIdx < minBaseIdx) { newMinBaseIdx = wordBaseIdx - 128; newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) * sizeof(TextWord *)); for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) { newPool[baseIdx - newMinBaseIdx] = NULL; } memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *)); gfree(pool); pool = newPool; minBaseIdx = newMinBaseIdx; } else if (wordBaseIdx > maxBaseIdx) { newMaxBaseIdx = wordBaseIdx + 128; pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *)); for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) { pool[baseIdx - minBaseIdx] = NULL; } maxBaseIdx = newMaxBaseIdx; } // insert the new word if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(cursor) > 0) { w0 = cursor; w1 = cursor->next; } else { w0 = NULL; w1 = pool[wordBaseIdx - minBaseIdx]; } for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ; word->next = w1; if (w0) { w0->next = word; } else { pool[wordBaseIdx - minBaseIdx] = word; } cursor = word; cursorBaseIdx = wordBaseIdx; } //------------------------------------------------------------------------ // TextLine //------------------------------------------------------------------------ TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) { blk = blkA; rot = rotA; xMin = yMin = 0; xMax = yMax = -1; base = baseA; words = lastWord = NULL; text = NULL; edge = NULL; col = NULL; len = 0; convertedLen = 0; hyphenated = gFalse; next = NULL; } TextLine::~TextLine() { TextWord *word; while (words) { word = words; words = words->next; delete word; } gfree(text); gfree(edge); gfree(col); } void TextLine::addWord(TextWord *word) { if (lastWord) { lastWord->next = word; } else { words = word; } lastWord = word; if (xMin > xMax) { xMin = word->xMin; xMax = word->xMax; 
yMin = word->yMin; yMax = word->yMax; } else { if (word->xMin < xMin) { xMin = word->xMin; } if (word->xMax > xMax) { xMax = word->xMax; } if (word->yMin < yMin) { yMin = word->yMin; } if (word->yMax > yMax) { yMax = word->yMax; } } } double TextLine::primaryDelta(TextLine *line) { double delta; delta = 0; // make gcc happy switch (rot) { case 0: delta = line->xMin - xMax; break; case 1: delta = line->yMin - yMax; break; case 2: delta = xMin - line->xMax; break; case 3: delta = yMin - line->yMax; break; } return delta; } int TextLine::primaryCmp(TextLine *line) { double cmp; cmp = 0; // make gcc happy switch (rot) { case 0: cmp = xMin - line->xMin; break; case 1: cmp = yMin - line->yMin; break; case 2: cmp = line->xMax - xMax; break; case 3: cmp = line->yMax - yMax; break; } return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } int TextLine::secondaryCmp(TextLine *line) { double cmp; cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base; return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } int TextLine::cmpYX(TextLine *line) { int cmp; if ((cmp = secondaryCmp(line))) { return cmp; } return primaryCmp(line); } int TextLine::cmpXY(const void *p1, const void *p2) { TextLine *line1 = *(TextLine **)p1; TextLine *line2 = *(TextLine **)p2; int cmp; if ((cmp = line1->primaryCmp(line2))) { return cmp; } return line1->secondaryCmp(line2); } void TextLine::coalesce(UnicodeMap *uMap) { TextWord *word0, *word1; double space, delta, minSpace; GBool isUnicode; char buf[8]; int i, j; if (words->next) { // compute the inter-word space threshold if (words->len > 1 || words->next->len > 1) { minSpace = 0; } else { minSpace = words->primaryDelta(words->next); for (word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) { if (word1->len > 1) { minSpace = 0; } delta = word0->primaryDelta(word1); if (delta < minSpace) { minSpace = delta; } } } if (minSpace <= 0) { space = maxCharSpacing * words->fontSize; } else { space = maxWideCharSpacingMul * 
minSpace; } // merge words word0 = words; word1 = words->next; while (word1) { if (word0->primaryDelta(word1) >= space) { word0->spaceAfter = gTrue; word0 = word1; word1 = word1->next; } else if (word0->font == word1->font && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize && word1->charPos == word0->charPos + word0->charLen) { word0->merge(word1); word0->next = word1->next; delete word1; word1 = word0->next; } else { word0 = word1; word1 = word1->next; } } } // build the line text isUnicode = uMap ? uMap->isUnicode() : gFalse; len = 0; for (word1 = words; word1; word1 = word1->next) { len += word1->len; if (word1->spaceAfter) { ++len; } } text = (Unicode *)gmalloc(len * sizeof(Unicode)); edge = (double *)gmalloc((len + 1) * sizeof(double)); i = 0; for (word1 = words; word1; word1 = word1->next) { for (j = 0; j < word1->len; ++j) { text[i] = word1->text[j]; edge[i] = word1->edge[j]; ++i; } edge[i] = word1->edge[word1->len]; if (word1->spaceAfter) { text[i] = (Unicode)0x0020; ++i; } } // compute convertedLen and set up the col array col = (int *)gmalloc((len + 1) * sizeof(int)); convertedLen = 0; for (i = 0; i < len; ++i) { col[i] = convertedLen; if (isUnicode) { ++convertedLen; } else if (uMap) { convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf)); } } col[len] = convertedLen; // check for hyphen at end of line //~ need to check for other chars used as hyphens hyphenated = text[len - 1] == (Unicode)'-'; } //------------------------------------------------------------------------ // TextLineFrag //------------------------------------------------------------------------ class TextLineFrag { public: TextLine *line; // the line object int start, len; // offset and length of this fragment // (in Unicode chars) double xMin, xMax; // bounding box coordinates double yMin, yMax; double base; // baseline virtual coordinate int col; // first column void init(TextLine *lineA, int startA, int lenA); void computeCoords(GBool oneRot); 
static int cmpYXPrimaryRot(const void *p1, const void *p2); static int cmpYXLineRot(const void *p1, const void *p2); static int cmpXYLineRot(const void *p1, const void *p2); }; void TextLineFrag::init(TextLine *lineA, int startA, int lenA) { line = lineA; start = startA; len = lenA; col = line->col[start]; } void TextLineFrag::computeCoords(GBool oneRot) { TextBlock *blk; double d0, d1, d2, d3, d4; if (oneRot) { switch (line->rot) { case 0: xMin = line->edge[start]; xMax = line->edge[start + len]; yMin = line->yMin; yMax = line->yMax; break; case 1: xMin = line->xMin; xMax = line->xMax; yMin = line->edge[start]; yMax = line->edge[start + len]; break; case 2: xMin = line->edge[start + len]; xMax = line->edge[start]; yMin = line->yMin; yMax = line->yMax; break; case 3: xMin = line->xMin; xMax = line->xMax; yMin = line->edge[start + len]; yMax = line->edge[start]; break; } base = line->base; } else { if (line->rot == 0 && line->blk->page->primaryRot == 0) { xMin = line->edge[start]; xMax = line->edge[start + len]; yMin = line->yMin; yMax = line->yMax; base = line->base; } else { blk = line->blk; d0 = line->edge[start]; d1 = line->edge[start + len]; d2 = d3 = d4 = 0; // make gcc happy switch (line->rot) { case 0: d2 = line->yMin; d3 = line->yMax; d4 = line->base; d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin); d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin); d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin); d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin); d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin); break; case 1: d2 = line->xMax; d3 = line->xMin; d4 = line->base; d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin); d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin); d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin); d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin); d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin); break; case 2: d2 = line->yMax; d3 = line->yMin; d4 = line->base; d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin); d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin); 
d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin); d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin); d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin); break; case 3: d2 = line->xMin; d3 = line->xMax; d4 = line->base; d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin); d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin); d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin); d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin); d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin); break; } switch (line->blk->page->primaryRot) { case 0: xMin = blk->xMin + d0 * (blk->xMax - blk->xMin); xMax = blk->xMin + d1 * (blk->xMax - blk->xMin); yMin = blk->yMin + d2 * (blk->yMax - blk->yMin); yMax = blk->yMin + d3 * (blk->yMax - blk->yMin); base = blk->yMin + base * (blk->yMax - blk->yMin); break; case 1: xMin = blk->xMax - d3 * (blk->xMax - blk->xMin); xMax = blk->xMax - d2 * (blk->xMax - blk->xMin); yMin = blk->yMin + d0 * (blk->yMax - blk->yMin); yMax = blk->yMin + d1 * (blk->yMax - blk->yMin); base = blk->xMax - d4 * (blk->xMax - blk->xMin); break; case 2: xMin = blk->xMax - d1 * (blk->xMax - blk->xMin); xMax = blk->xMax - d0 * (blk->xMax - blk->xMin); yMin = blk->yMax - d3 * (blk->yMax - blk->yMin); yMax = blk->yMax - d2 * (blk->yMax - blk->yMin); base = blk->yMax - d4 * (blk->yMax - blk->yMin); break; case 3: xMin = blk->xMin + d2 * (blk->xMax - blk->xMin); xMax = blk->xMin + d3 * (blk->xMax - blk->xMin); yMin = blk->yMax - d1 * (blk->yMax - blk->yMin); yMax = blk->yMax - d0 * (blk->yMax - blk->yMin); base = blk->xMin + d4 * (blk->xMax - blk->xMin); break; } } } } int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) { TextLineFrag *frag1 = (TextLineFrag *)p1; TextLineFrag *frag2 = (TextLineFrag *)p2; double cmp; cmp = 0; // make gcc happy switch (frag1->line->blk->page->primaryRot) { case 0: if ((cmp = frag1->yMin - frag2->yMin) == 0) { cmp = frag1->xMin - frag2->xMin; } break; case 1: if ((cmp = frag2->xMax - frag1->xMax) == 0) { cmp = frag1->yMin - frag2->yMin; } break; case 2: if 
((cmp = frag2->yMin - frag1->yMin) == 0) { cmp = frag2->xMax - frag1->xMax; } break; case 3: if ((cmp = frag1->xMax - frag2->xMax) == 0) { cmp = frag2->yMax - frag1->yMax; } break; } return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) { TextLineFrag *frag1 = (TextLineFrag *)p1; TextLineFrag *frag2 = (TextLineFrag *)p2; double cmp; cmp = 0; // make gcc happy switch (frag1->line->rot) { case 0: if ((cmp = frag1->yMin - frag2->yMin) == 0) { cmp = frag1->xMin - frag2->xMin; } break; case 1: if ((cmp = frag2->xMax - frag1->xMax) == 0) { cmp = frag1->yMin - frag2->yMin; } break; case 2: if ((cmp = frag2->yMin - frag1->yMin) == 0) { cmp = frag2->xMax - frag1->xMax; } break; case 3: if ((cmp = frag1->xMax - frag2->xMax) == 0) { cmp = frag2->yMax - frag1->yMax; } break; } return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) { TextLineFrag *frag1 = (TextLineFrag *)p1; TextLineFrag *frag2 = (TextLineFrag *)p2; double cmp; cmp = 0; // make gcc happy switch (frag1->line->rot) { case 0: if ((cmp = frag1->xMin - frag2->xMin) == 0) { cmp = frag1->yMin - frag2->yMin; } break; case 1: if ((cmp = frag1->yMin - frag2->yMin) == 0) { cmp = frag2->xMax - frag1->xMax; } break; case 2: if ((cmp = frag2->xMax - frag1->xMax) == 0) { cmp = frag2->yMin - frag1->yMin; } break; case 3: if ((cmp = frag2->yMax - frag1->yMax) == 0) { cmp = frag1->xMax - frag2->xMax; } break; } return cmp < 0 ? -1 : cmp > 0 ? 
1 : 0; } //------------------------------------------------------------------------ // TextBlock //------------------------------------------------------------------------ TextBlock::TextBlock(TextPage *pageA, int rotA) { page = pageA; rot = rotA; xMin = yMin = 0; xMax = yMax = -1; priMin = 0; priMax = page->pageWidth; pool = new TextPool(); lines = NULL; curLine = NULL; next = NULL; stackNext = NULL; } TextBlock::~TextBlock() { TextLine *line; delete pool; while (lines) { line = lines; lines = lines->next; delete line; } } void TextBlock::addWord(TextWord *word) { pool->addWord(word); if (xMin > xMax) { xMin = word->xMin; xMax = word->xMax; yMin = word->yMin; yMax = word->yMax; } else { if (word->xMin < xMin) { xMin = word->xMin; } if (word->xMax > xMax) { xMax = word->xMax; } if (word->yMin < yMin) { yMin = word->yMin; } if (word->yMax > yMax) { yMax = word->yMax; } } } void TextBlock::coalesce(UnicodeMap *uMap) { TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord; TextLine *line, *line0, *line1; int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx; int baseIdx, bestWordBaseIdx, idx0, idx1; double minBase, maxBase; double fontSize, delta, priDelta, secDelta; TextLine **lineArray; GBool found; int col1, col2; int i, j, k; // discard duplicated text (fake boldface, drop shadows) for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) { word0 = pool->getPool(idx0); while (word0) { priDelta = dupMaxPriDelta * word0->fontSize; secDelta = dupMaxSecDelta * word0->fontSize; if (rot == 0 || rot == 3) { maxBaseIdx = pool->getBaseIdx(word0->base + secDelta); } else { maxBaseIdx = pool->getBaseIdx(word0->base - secDelta); } found = gFalse; word1 = word2 = NULL; // make gcc happy for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) { if (idx1 == idx0) { word1 = word0; word2 = word0->next; } else { word1 = NULL; word2 = pool->getPool(idx1); } for (; word2; word1 = word2, word2 = word2->next) { if (word2->len == word0->len && !memcmp(word2->text, 
word0->text,
		      word0->len * sizeof(Unicode))) {
	    // same text: it is a duplicate if all four box coordinates
	    // agree within the primary/secondary tolerances
	    switch (rot) {
	    case 0:
	    case 2:
	      found = fabs(word0->xMin - word2->xMin) < priDelta &&
		      fabs(word0->xMax - word2->xMax) < priDelta &&
		      fabs(word0->yMin - word2->yMin) < secDelta &&
		      fabs(word0->yMax - word2->yMax) < secDelta;
	      break;
	    case 1:
	    case 3:
	      found = fabs(word0->xMin - word2->xMin) < secDelta &&
		      fabs(word0->xMax - word2->xMax) < secDelta &&
		      fabs(word0->yMin - word2->yMin) < priDelta &&
		      fabs(word0->yMax - word2->yMax) < priDelta;
	      break;
	    }
	  }
	  if (found) {
	    break;
	  }
	}
	if (found) {
	  break;
	}
      }
      if (found) {
	// unlink and delete the duplicate; word0 is kept and checked
	// again, so several duplicates of one word are all removed
	if (word1) {
	  word1->next = word2->next;
	} else {
	  pool->setPool(idx1, word2->next);
	}
	delete word2;
      } else {
	word0 = word0->next;
      }
    }
  }

  // build the lines
  curLine = NULL;
  poolMinBaseIdx = pool->minBaseIdx;
  charCount = 0;
  nLines = 0;
  while (1) {

    // find the first non-empty line in the pool
    for (;
	 poolMinBaseIdx <= pool->maxBaseIdx &&
	   !pool->getPool(poolMinBaseIdx);
	 ++poolMinBaseIdx) ;
    if (poolMinBaseIdx > pool->maxBaseIdx) {
      break;
    }

    // look for the left-most word in the first four lines of the
    // pool -- this avoids starting with a superscript word
    startBaseIdx = poolMinBaseIdx;
    for (baseIdx = poolMinBaseIdx + 1;
	 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
	 ++baseIdx) {
      if (!pool->getPool(baseIdx)) {
	continue;
      }
      if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
	  < 0) {
	startBaseIdx = baseIdx;
      }
    }

    // create a new line
    // (the first word of the chosen bucket is moved out of the pool and
    // becomes the first word of the line)
    word0 = pool->getPool(startBaseIdx);
    pool->setPool(startBaseIdx, word0->next);
    word0->next = NULL;
    line = new TextLine(this, word0->rot, word0->base);
    line->addWord(word0);
    lastWord = word0;

    // compute the search range
    fontSize = word0->fontSize;
    minBase = word0->base - maxIntraLineDelta * fontSize;
    maxBase = word0->base + maxIntraLineDelta * fontSize;
    minBaseIdx = pool->getBaseIdx(minBase);
    maxBaseIdx = pool->getBaseIdx(maxBase);

    // find the rest of the words in this line
    while (1) {

      // find the left-most word whose baseline is in the range for
      // this line
      bestWordBaseIdx = 0;
bestWord0 = bestWord1 = NULL;
      for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
	// word0 trails word1 so the match can be unlinked afterwards;
	// only the first word of each bucket that passes the baseline
	// and minimum-spacing test is considered
	for (word0 = NULL, word1 = pool->getPool(baseIdx);
	     word1;
	     word0 = word1, word1 = word1->next) {
	  if (word1->base >= minBase &&
	      word1->base <= maxBase &&
	      (delta = lastWord->primaryDelta(word1)) >=
	        minCharSpacing * fontSize) {
	    if (delta < maxWordSpacing * fontSize &&
		(!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
	      bestWordBaseIdx = baseIdx;
	      bestWord0 = word0;
	      bestWord1 = word1;
	    }
	    break;
	  }
	}
      }
      if (!bestWord1) {
	break;
      }

      // remove it from the pool, and add it to the line
      if (bestWord0) {
	bestWord0->next = bestWord1->next;
      } else {
	pool->setPool(bestWordBaseIdx, bestWord1->next);
      }
      bestWord1->next = NULL;
      line->addWord(bestWord1);
      lastWord = bestWord1;
    }

    // add the line
    // (the list is kept sorted by cmpYX; the search starts at the
    // previously added line when the new line sorts after it)
    if (curLine && line->cmpYX(curLine) > 0) {
      line0 = curLine;
      line1 = curLine->next;
    } else {
      line0 = NULL;
      line1 = lines;
    }
    for (;
	 line1 && line->cmpYX(line1) > 0;
	 line0 = line1, line1 = line1->next) ;
    if (line0) {
      line0->next = line;
    } else {
      lines = line;
    }
    line->next = line1;
    curLine = line;
    line->coalesce(uMap);
    charCount += line->len;
    ++nLines;
  }

  // sort lines into xy order for column assignment
  lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *));
  for (line = lines, i = 0; line; line = line->next, ++i) {
    lineArray[i] = line;
  }
  qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);

  // column assignment
  nColumns = 0;
  for (i = 0; i < nLines; ++i) {
    line0 = lineArray[i];
    col1 = 0;
    for (j = 0; j < i; ++j) {
      line1 = lineArray[j];
      if (line1->primaryDelta(line0) >= 0) {
	// line0 starts at or after the end of line1: it must start
	// after line1's last column
	col2 = line1->col[line1->len] + 1;
      } else {
	// otherwise use the column of the character of line1 at whose
	// midpoint line0 starts
	k = 0; // make gcc happy
	switch (rot) {
	case 0:
	  for (k = 0;
	       k < line1->len &&
		 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
	       ++k) ;
	  break;
	case 1:
	  for (k = 0;
	       k < line1->len &&
		 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
	       ++k) ;
	  break;
	case 2:
	  for (k = 0;
	       k < line1->len &&
		 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
	       ++k) ;
	  break;
	case 3:
	  for (k = 0;
k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]); ++k) ; break; } col2 = line1->col[k]; } if (col2 > col1) { col1 = col2; } } for (k = 0; k <= line0->len; ++k) { line0->col[k] += col1; } if (line0->col[line0->len] > nColumns) { nColumns = line0->col[line0->len]; } } gfree(lineArray); } void TextBlock::updatePriMinMax(TextBlock *blk) { double newPriMin, newPriMax; GBool gotPriMin, gotPriMax; gotPriMin = gotPriMax = gFalse; newPriMin = newPriMax = 0; // make gcc happy switch (page->primaryRot) { case 0: case 2: if (blk->yMin < yMax && blk->yMax > yMin) { if (blk->xMin < xMin) { newPriMin = blk->xMax; gotPriMin = gTrue; } if (blk->xMax > xMax) { newPriMax = blk->xMin; gotPriMax = gTrue; } } break; case 1: case 3: if (blk->xMin < xMax && blk->xMax > xMin) { if (blk->yMin < yMin) { newPriMin = blk->yMax; gotPriMin = gTrue; } if (blk->yMax > yMax) { newPriMax = blk->yMin; gotPriMax = gTrue; } } break; } if (gotPriMin) { if (newPriMin > xMin) { newPriMin = xMin; } if (newPriMin > priMin) { priMin = newPriMin; } } if (gotPriMax) { if (newPriMax < xMax) { newPriMax = xMax; } if (newPriMax < priMax) { priMax = newPriMax; } } } int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) { TextBlock *blk1 = *(TextBlock **)p1; TextBlock *blk2 = *(TextBlock **)p2; double cmp; cmp = 0; // make gcc happy switch (blk1->page->primaryRot) { case 0: if ((cmp = blk1->xMin - blk2->xMin) == 0) { cmp = blk1->yMin - blk2->yMin; } break; case 1: if ((cmp = blk1->yMin - blk2->yMin) == 0) { cmp = blk2->xMax - blk1->xMax; } break; case 2: if ((cmp = blk2->xMax - blk1->xMax) == 0) { cmp = blk2->yMin - blk1->yMin; } break; case 3: if ((cmp = blk2->yMax - blk1->yMax) == 0) { cmp = blk1->xMax - blk2->xMax; } break; } return cmp < 0 ? -1 : cmp > 0 ? 
1 : 0; } int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) { TextBlock *blk1 = *(TextBlock **)p1; TextBlock *blk2 = *(TextBlock **)p2; double cmp; cmp = 0; // make gcc happy switch (blk1->page->primaryRot) { case 0: if ((cmp = blk1->yMin - blk2->yMin) == 0) { cmp = blk1->xMin - blk2->xMin; } break; case 1: if ((cmp = blk2->xMax - blk1->xMax) == 0) { cmp = blk1->yMin - blk2->yMin; } break; case 2: if ((cmp = blk2->yMin - blk1->yMin) == 0) { cmp = blk2->xMax - blk1->xMax; } break; case 3: if ((cmp = blk1->xMax - blk2->xMax) == 0) { cmp = blk2->yMax - blk1->yMax; } break; } return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } int TextBlock::primaryCmp(TextBlock *blk) { double cmp; cmp = 0; // make gcc happy switch (rot) { case 0: cmp = xMin - blk->xMin; break; case 1: cmp = yMin - blk->yMin; break; case 2: cmp = blk->xMax - xMax; break; case 3: cmp = blk->yMax - yMax; break; } return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; } double TextBlock::secondaryDelta(TextBlock *blk) { double delta; delta = 0; // make gcc happy switch (rot) { case 0: delta = blk->yMin - yMax; break; case 1: delta = xMin - blk->xMax; break; case 2: delta = yMin - blk->yMax; break; case 3: delta = blk->xMin - xMax; break; } return delta; } GBool TextBlock::isBelow(TextBlock *blk) { GBool below; below = gFalse; // make gcc happy switch (page->primaryRot) { case 0: below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin; break; case 1: below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax; break; case 2: below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax; break; case 3: below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin; break; } return below; } //------------------------------------------------------------------------ // TextFlow //------------------------------------------------------------------------ TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) { page = pageA; xMin = blk->xMin; xMax = blk->xMax; yMin = blk->yMin; yMax = 
blk->yMax; priMin = blk->priMin; priMax = blk->priMax; blocks = lastBlk = blk; next = NULL; } TextFlow::~TextFlow() { TextBlock *blk; while (blocks) { blk = blocks; blocks = blocks->next; delete blk; } } void TextFlow::addBlock(TextBlock *blk) { if (lastBlk) { lastBlk->next = blk; } else { blocks = blk; } lastBlk = blk; if (blk->xMin < xMin) { xMin = blk->xMin; } if (blk->xMax > xMax) { xMax = blk->xMax; } if (blk->yMin < yMin) { yMin = blk->yMin; } if (blk->yMax > yMax) { yMax = blk->yMax; } } GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) { GBool fits; // lower blocks must use smaller fonts if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) { return gFalse; } fits = gFalse; // make gcc happy switch (page->primaryRot) { case 0: fits = blk->xMin >= priMin && blk->xMax <= priMax; break; case 1: fits = blk->yMin >= priMin && blk->yMax <= priMax; break; case 2: fits = blk->xMin >= priMin && blk->xMax <= priMax; break; case 3: fits = blk->yMin >= priMin && blk->yMax <= priMax; break; } return fits; } #if TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextWordList //------------------------------------------------------------------------ TextWordList::TextWordList(TextPage *text, GBool physLayout) { TextFlow *flow; TextBlock *blk; TextLine *line; TextWord *word; TextWord **wordArray; int nWords, i; words = new GList(); if (text->rawOrder) { for (word = text->rawWords; word; word = word->next) { words->append(word); } } else if (physLayout) { // this is inefficient, but it's also the least useful of these // three cases nWords = 0; for (flow = text->flows; flow; flow = flow->next) { for (blk = flow->blocks; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for (word = line->words; word; word = word->next) { ++nWords; } } } } wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *)); i = 0; for (flow = text->flows; flow; flow = flow->next) { for (blk = 
flow->blocks; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for (word = line->words; word; word = word->next) { wordArray[i++] = word; } } } } qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX); for (i = 0; i < nWords; ++i) { words->append(wordArray[i]); } gfree(wordArray); } else { for (flow = text->flows; flow; flow = flow->next) { for (blk = flow->blocks; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for (word = line->words; word; word = word->next) { words->append(word); } } } } } } TextWordList::~TextWordList() { delete words; } int TextWordList::getLength() { return words->getLength(); } TextWord *TextWordList::get(int idx) { if (idx < 0 || idx >= words->getLength()) { return NULL; } return (TextWord *)words->get(idx); } #endif // TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ TextPage::TextPage(GBool rawOrderA) { int rot; rawOrder = rawOrderA; curWord = NULL; charPos = 0; curFont = NULL; curFontSize = 0; nest = 0; nTinyChars = 0; if (!rawOrder) { for (rot = 0; rot < 4; ++rot) { pools[rot] = new TextPool(); } } flows = NULL; blocks = NULL; rawWords = NULL; rawLastWord = NULL; fonts = new GList(); lastFindXMin = lastFindYMin = 0; haveLastFind = gFalse; } TextPage::~TextPage() { int rot; clear(); if (!rawOrder) { for (rot = 0; rot < 4; ++rot) { delete pools[rot]; } } delete fonts; } void TextPage::startPage(GfxState *state) { clear(); if (state) { pageWidth = state->getPageWidth(); pageHeight = state->getPageHeight(); } else { pageWidth = pageHeight = 0; } } void TextPage::clear() { int rot; TextFlow *flow; TextWord *word; if (curWord) { delete curWord; curWord = NULL; } if (rawOrder) { while (rawWords) { word = rawWords; rawWords = rawWords->next; delete word; } } else { for (rot = 0; rot < 4; ++rot) { delete pools[rot]; } while (flows) { flow = flows; flows = 
flows->next; delete flow; } gfree(blocks); } deleteGList(fonts, TextFontInfo); curWord = NULL; charPos = 0; curFont = NULL; curFontSize = 0; nest = 0; nTinyChars = 0; if (!rawOrder) { for (rot = 0; rot < 4; ++rot) { pools[rot] = new TextPool(); } } flows = NULL; blocks = NULL; rawWords = NULL; rawLastWord = NULL; fonts = new GList(); } void TextPage::updateFont(GfxState *state) { GfxFont *gfxFont; double *fm; char *name; int code, mCode, letterCode, anyCode; double w; int i; // get the font info object curFont = NULL; for (i = 0; i < fonts->getLength(); ++i) { curFont = (TextFontInfo *)fonts->get(i); if (curFont->matches(state)) { break; } curFont = NULL; } if (!curFont) { curFont = new TextFontInfo(state); fonts->append(curFont); } // adjust the font size gfxFont = state->getFont(); curFontSize = state->getTransformedFontSize(); if (gfxFont && gfxFont->getType() == fontType3) { // This is a hack which makes it possible to deal with some Type 3 // fonts. The problem is that it's impossible to know what the // base coordinate system used in the font is without actually // rendering the font. This code tries to guess by looking at the // width of the character 'm' (which breaks if the font is a // subset that doesn't contain 'm'). 
mCode = letterCode = anyCode = -1; for (code = 0; code < 256; ++code) { name = ((Gfx8BitFont *)gfxFont)->getCharName(code); if (name && name[0] == 'm' && name[1] == '\0') { mCode = code; } if (letterCode < 0 && name && name[1] == '\0' && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) { letterCode = code; } if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { anyCode = code; } } if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { // 0.6 is a generic average 'm' width -- yes, this is a hack curFontSize *= w / 0.6; } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { // even more of a hack: 0.5 is a generic letter width curFontSize *= w / 0.5; } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { // better than nothing: 0.5 is a generic character width curFontSize *= w / 0.5; } fm = gfxFont->getFontMatrix(); if (fm[0] != 0) { curFontSize *= fabs(fm[3] / fm[0]); } } } void TextPage::beginWord(GfxState *state, double x0, double y0) { double *txtm, *ctm, *fontm; double m[4], m2[4]; int rot; // This check is needed because Type 3 characters can contain // text-drawing operations (when TextPage is being used via // XOutputDev rather than TextOutputDev). if (curWord) { ++nest; return; } // compute the rotation txtm = state->getTextMat(); ctm = state->getCTM(); m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2]; m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3]; m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2]; m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3]; if (state->getFont()->getType() == fontType3) { fontm = state->getFont()->getFontMatrix(); m2[0] = fontm[0] * m[0] + fontm[1] * m[2]; m2[1] = fontm[0] * m[1] + fontm[1] * m[3]; m2[2] = fontm[2] * m[0] + fontm[3] * m[2]; m2[3] = fontm[2] * m[1] + fontm[3] * m[3]; m[0] = m2[0]; m[1] = m2[1]; m[2] = m2[2]; m[3] = m2[3]; } if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) { rot = (m[3] < 0) ? 
0 : 2; } else { rot = (m[2] > 0) ? 1 : 3; } curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize); } void TextPage::addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, Unicode *u, int uLen) { double x1, y1, w1, h1, dx2, dy2, sp; int n, i; // if the previous char was a space, addChar will have called // endWord, so we need to start a new word if (!curWord) { beginWord(state, x, y); } // throw away chars that aren't inside the page bounds state->transform(x, y, &x1, &y1); if (x1 < 0 || x1 > pageWidth || y1 < 0 || y1 > pageHeight) { return; } // subtract char and word spacing from the dx,dy values sp = state->getCharSpace(); if (c == (CharCode)0x20) { sp += state->getWordSpace(); } state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); dx -= dx2; dy -= dy2; state->transformDelta(dx, dy, &w1, &h1); // check the tiny chars limit if (!globalParams->getTextKeepTinyChars() && fabs(w1) < 3 && fabs(h1) < 3) { if (++nTinyChars > 50000) { return; } } // break words at space character if (uLen == 1 && u[0] == (Unicode)0x20) { ++curWord->charLen; ++charPos; endWord(); return; } // large char spacing is sometimes used to move text around -- in // this case, break text into individual chars and let the coalesce // function deal with it later n = curWord->len; if (n > 0) { switch (curWord->rot) { case 0: sp = x1 - curWord->xMax; break; case 1: sp = y1 - curWord->yMax; break; case 2: sp = curWord->xMin - x1; break; case 3: sp = curWord->yMin - y1; break; } if (sp > defaultSpaceWidth * curWord->fontSize) { endWord(); beginWord(state, x, y); } } // page rotation and/or transform matrices can cause text to be // drawn in reverse order -- in this case, swap the begin/end // coordinates and break text into individual chars if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) { endWord(); beginWord(state, x + dx, y + dy); x1 += w1; y1 += h1; w1 
= -w1; h1 = -h1; } // add the characters to the current word if (uLen != 0) { w1 /= uLen; h1 /= uLen; } for (i = 0; i < uLen; ++i) { curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); } ++curWord->charLen; ++charPos; } void TextPage::endWord() { // This check is needed because Type 3 characters can contain // text-drawing operations (when TextPage is being used via // XOutputDev rather than TextOutputDev). if (nest > 0) { --nest; return; } if (curWord) { addWord(curWord); curWord = NULL; } } void TextPage::addWord(TextWord *word) { // throw away zero-length words -- they don't have valid xMin/xMax // values, and they're useless anyway if (word->len == 0) { delete word; return; } if (rawOrder) { if (rawLastWord) { rawLastWord->next = word; } else { rawWords = word; } rawLastWord = word; } else { pools[word->rot]->addWord(word); } } void TextPage::coalesce(GBool physLayout) { UnicodeMap *uMap; TextPool *pool; TextWord *word0, *word1, *word2; TextLine *line; TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1; TextBlock **blkArray; TextFlow *flow, *lastFlow; int rot, poolMinBaseIdx, baseIdx, startBaseIdx; double minBase, maxBase, newMinBase, newMaxBase; double fontSize, colSpace, lineSpace, intraLineSpace, blkSpace; GBool found; int count[4]; int lrCount; int firstBlkIdx, nBlocksLeft; int col1, col2; int i, j, n; if (rawOrder) { primaryRot = 0; primaryLR = gTrue; return; } uMap = globalParams->getTextEncoding(); blkList = NULL; lastBlk = NULL; nBlocks = 0; primaryRot = -1; #if 0 // for debugging printf("*** initial words ***\n"); for (rot = 0; rot < 4; ++rot) { pool = pools[rot]; for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) { for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } 
printf("'\n"); } } } printf("\n"); #endif //----- assemble the blocks //~ add an outer loop for writing mode (vertical text) // build blocks for each rotation value for (rot = 0; rot < 4; ++rot) { pool = pools[rot]; poolMinBaseIdx = pool->minBaseIdx; count[rot] = 0; // add blocks until no more words are left while (1) { // find the first non-empty line in the pool for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx) ; if (poolMinBaseIdx > pool->maxBaseIdx) { break; } // look for the left-most word in the first four lines of the // pool -- this avoids starting with a superscript word startBaseIdx = poolMinBaseIdx; for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) { if (!pool->getPool(baseIdx)) { continue; } if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) { startBaseIdx = baseIdx; } } // create a new block word0 = pool->getPool(startBaseIdx); pool->setPool(startBaseIdx, word0->next); word0->next = NULL; blk = new TextBlock(this, rot); blk->addWord(word0); fontSize = word0->fontSize; minBase = maxBase = word0->base; colSpace = minColSpacing * fontSize; lineSpace = maxLineSpacingDelta * fontSize; intraLineSpace = maxIntraLineDelta * fontSize; // add words to the block do { found = gFalse; // look for words on the line above the current top edge of // the block newMinBase = minBase; for (baseIdx = pool->getBaseIdx(minBase); baseIdx >= pool->getBaseIdx(minBase - lineSpace); --baseIdx) { word0 = NULL; word1 = pool->getPool(baseIdx); while (word1) { if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? 
(word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) { word2 = word1; if (word0) { word0->next = word1->next; } else { pool->setPool(baseIdx, word1->next); } word1 = word1->next; word2->next = NULL; blk->addWord(word2); found = gTrue; newMinBase = word2->base; } else { word0 = word1; word1 = word1->next; } } } minBase = newMinBase; // look for words on the line below the current bottom edge of // the block newMaxBase = maxBase; for (baseIdx = pool->getBaseIdx(maxBase); baseIdx <= pool->getBaseIdx(maxBase + lineSpace); ++baseIdx) { word0 = NULL; word1 = pool->getPool(baseIdx); while (word1) { if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) { word2 = word1; if (word0) { word0->next = word1->next; } else { pool->setPool(baseIdx, word1->next); } word1 = word1->next; word2->next = NULL; blk->addWord(word2); found = gTrue; newMaxBase = word2->base; } else { word0 = word1; word1 = word1->next; } } } maxBase = newMaxBase; // look for words that are on lines already in the block, and // that overlap the block horizontally for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) { word0 = NULL; word1 = pool->getPool(baseIdx); while (word1) { if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace && ((rot == 0 || rot == 2) ? 
(word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) { word2 = word1; if (word0) { word0->next = word1->next; } else { pool->setPool(baseIdx, word1->next); } word1 = word1->next; word2->next = NULL; blk->addWord(word2); found = gTrue; } else { word0 = word1; word1 = word1->next; } } } // only check for outlying words (the next two chunks of code) // if we didn't find anything else if (found) { continue; } // scan down the left side of the block, looking for words // that are near (but not overlapping) the block; if there are // three or fewer, add them to the block n = 0; for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) { word1 = pool->getPool(baseIdx); while (word1) { if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { ++n; break; } word1 = word1->next; } } if (n > 0 && n <= 3) { for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) { word0 = NULL; word1 = pool->getPool(baseIdx); while (word1) { if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace && ((rot == 0 || rot == 2) ? 
(word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { word2 = word1; if (word0) { word0->next = word1->next; } else { pool->setPool(baseIdx, word1->next); } word1 = word1->next; word2->next = NULL; blk->addWord(word2); if (word2->base < minBase) { minBase = word2->base; } else if (word2->base > maxBase) { maxBase = word2->base; } found = gTrue; break; } else { word0 = word1; word1 = word1->next; } } } } // scan down the right side of the block, looking for words // that are near (but not overlapping) the block; if there are // three or fewer, add them to the block n = 0; for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) { word1 = pool->getPool(baseIdx); while (word1) { if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { ++n; break; } word1 = word1->next; } } if (n > 0 && n <= 3) { for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) { word0 = NULL; word1 = pool->getPool(baseIdx); while (word1) { if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace && ((rot == 0 || rot == 2) ? 
(word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace)) && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) { word2 = word1; if (word0) { word0->next = word1->next; } else { pool->setPool(baseIdx, word1->next); } word1 = word1->next; word2->next = NULL; blk->addWord(word2); if (word2->base < minBase) { minBase = word2->base; } else if (word2->base > maxBase) { maxBase = word2->base; } found = gTrue; break; } else { word0 = word1; word1 = word1->next; } } } } } while (found); //~ need to compute the primary writing mode (horiz/vert) in //~ addition to primary rotation // coalesce the block, and add it to the list blk->coalesce(uMap); if (lastBlk) { lastBlk->next = blk; } else { blkList = blk; } lastBlk = blk; count[rot] += blk->charCount; if (primaryRot < 0 || count[rot] > count[primaryRot]) { primaryRot = rot; } ++nBlocks; } } #if 0 // for debugging printf("*** rotation ***\n"); for (rot = 0; rot < 4; ++rot) { printf(" %d: %6d\n", rot, count[rot]); } printf(" primary rot = %d\n", primaryRot); printf("\n"); #endif #if 0 // for debugging printf("*** blocks ***\n"); for (blk = blkList; blk; blk = blk->next) { printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n", blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax); for (line = blk->lines; line; line = line->next) { printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n", line->xMin, line->xMax, line->yMin, line->yMax, line->base); for (word0 = line->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } } printf("\n"); #endif // determine the primary direction lrCount = 0; for (blk = blkList; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { for 
(word0 = line->words; word0; word0 = word0->next) { for (i = 0; i < word0->len; ++i) { if (unicodeTypeL(word0->text[i])) { ++lrCount; } else if (unicodeTypeR(word0->text[i])) { --lrCount; } } } } } primaryLR = lrCount >= 0; #if 0 // for debugging printf("*** direction ***\n"); printf("lrCount = %d\n", lrCount); printf("primaryLR = %d\n", primaryLR); #endif //----- column assignment // sort blocks into xy order for column assignment blocks = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *)); for (blk = blkList, i = 0; blk; blk = blk->next, ++i) { blocks[i] = blk; } qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot); // column assignment for (i = 0; i < nBlocks; ++i) { blk0 = blocks[i]; col1 = 0; for (j = 0; j < i; ++j) { blk1 = blocks[j]; col2 = 0; // make gcc happy switch (primaryRot) { case 0: if (blk0->xMin > blk1->xMax) { col2 = blk1->col + blk1->nColumns + 3; } else { col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns); } break; case 1: if (blk0->yMin > blk1->yMax) { col2 = blk1->col + blk1->nColumns + 3; } else { col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns); } break; case 2: if (blk0->xMax < blk1->xMin) { col2 = blk1->col + blk1->nColumns + 3; } else { col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns); } break; case 3: if (blk0->yMax < blk1->yMin) { col2 = blk1->col + blk1->nColumns + 3; } else { col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns); } break; } if (col2 > col1) { col1 = col2; } } blk0->col = col1; for (line = blk0->lines; line; line = line->next) { for (j = 0; j <= line->len; ++j) { line->col[j] += col1; } } } #if 0 // for debugging printf("*** blocks, after column assignment ***\n"); for (blk = blkList; blk; blk = blk->next) { printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n", blk->rot, blk->xMin, blk->xMax, 
blk->yMin, blk->yMax, blk->col, blk->nColumns); for (line = blk->lines; line; line = line->next) { printf(" line:\n"); for (word0 = line->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } } printf("\n"); #endif //----- reading order sort // sort blocks into yx order (in preparation for reading order sort) qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot); // compute space on left and right sides of each block for (i = 0; i < nBlocks; ++i) { blk0 = blocks[i]; for (j = 0; j < nBlocks; ++j) { blk1 = blocks[j]; if (blk1 != blk0) { blk0->updatePriMinMax(blk1); } } } #if 0 // for debugging printf("*** blocks, after yx sort ***\n"); for (i = 0; i < nBlocks; ++i) { blk = blocks[i]; printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n", blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->priMin, blk->priMax); for (line = blk->lines; line; line = line->next) { printf(" line:\n"); for (word0 = line->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); for (j = 0; j < word0->len; ++j) { fputc(word0->text[j] & 0xff, stdout); } printf("'\n"); } } } printf("\n"); #endif // build the flows //~ this needs to be adjusted for writing mode (vertical text) //~ this also needs to account for right-to-left column ordering blkArray = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *)); memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *)); flows = lastFlow = NULL; firstBlkIdx = 0; nBlocksLeft = nBlocks; while (nBlocksLeft > 0) { // find the upper-left-most block for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ; i = 
firstBlkIdx; blk = blkArray[i]; for (j = firstBlkIdx + 1; j < nBlocks; ++j) { blk1 = blkArray[j]; if (blk1) { if (blk && blk->secondaryDelta(blk1) > 0) { break; } if (blk1->primaryCmp(blk) < 0) { i = j; blk = blk1; } } } blkArray[i] = NULL; --nBlocksLeft; blk->next = NULL; // create a new flow, starting with the upper-left-most block flow = new TextFlow(this, blk); if (lastFlow) { lastFlow->next = flow; } else { flows = flow; } lastFlow = flow; fontSize = blk->lines->words->fontSize; // push the upper-left-most block on the stack blk->stackNext = NULL; blkStack = blk; // find the other blocks in this flow while (blkStack) { // find the upper-left-most block under (but within // maxBlockSpacing of) the top block on the stack blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize; blk = NULL; i = -1; for (j = firstBlkIdx; j < nBlocks; ++j) { blk1 = blkArray[j]; if (blk1) { if (blkStack->secondaryDelta(blk1) > blkSpace) { break; } if (blk && blk->secondaryDelta(blk1) > 0) { break; } if (blk1->isBelow(blkStack) && (!blk || blk1->primaryCmp(blk) < 0)) { i = j; blk = blk1; } } } // if a suitable block was found, add it to the flow and push it // onto the stack if (blk && flow->blockFits(blk, blkStack)) { blkArray[i] = NULL; --nBlocksLeft; blk->next = NULL; flow->addBlock(blk); fontSize = blk->lines->words->fontSize; blk->stackNext = blkStack; blkStack = blk; // otherwise (if there is no block under the top block or the // block is not suitable), pop the stack } else { blkStack = blkStack->stackNext; } } } gfree(blkArray); #if 0 // for debugging printf("*** flows ***\n"); for (flow = flows; flow; flow = flow->next) { printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n", flow->xMin, flow->xMax, flow->yMin, flow->yMax, flow->priMin, flow->priMax); for (blk = flow->blocks; blk; blk = blk->next) { printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n", blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->priMin, blk->priMax); for (line = 
blk->lines; line; line = line->next) { printf(" line:\n"); for (word0 = line->words; word0; word0 = word0->next) { printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '", word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->base, word0->fontSize, word0->spaceAfter); for (i = 0; i < word0->len; ++i) { fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } } } printf("\n"); #endif if (uMap) { uMap->decRefCnt(); } } GBool TextPage::findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax) { TextBlock *blk; TextLine *line; Unicode *p; Unicode u1, u2; int m, i, j, k; double xStart, yStart, xStop, yStop; double xMin0, yMin0, xMax0, yMax0; double xMin1, yMin1, xMax1, yMax1; GBool found; //~ needs to handle right-to-left text if (rawOrder) { return gFalse; } xStart = yStart = xStop = yStop = 0; if (startAtLast && haveLastFind) { xStart = lastFindXMin; yStart = lastFindYMin; } else if (!startAtTop) { xStart = *xMin; yStart = *yMin; } if (stopAtLast && haveLastFind) { xStop = lastFindXMin; yStop = lastFindYMin; } else if (!stopAtBottom) { xStop = *xMax; yStop = *yMax; } found = gFalse; xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy for (i = 0; i < nBlocks; ++i) { blk = blocks[i]; // check: is the block above the top limit? if (!startAtTop && blk->yMax < yStart) { continue; } // check: is the block below the bottom limit? if (!stopAtBottom && blk->yMin > yStop) { break; } for (line = blk->lines; line; line = line->next) { // check: is the line above the top limit? if (!startAtTop && line->yMin < yStart) { continue; } // check: is the line below the bottom limit? 
if (!stopAtBottom && line->yMin > yStop) { continue; } // search each position in this line m = line->len; for (j = 0, p = line->text; j <= m - len; ++j, ++p) { // compare the strings for (k = 0; k < len; ++k) { #if 1 //~ this lowercases Latin A-Z only -- this will eventually be //~ extended to handle other character sets if (p[k] >= 0x41 && p[k] <= 0x5a) { u1 = p[k] + 0x20; } else { u1 = p[k]; } if (s[k] >= 0x41 && s[k] <= 0x5a) { u2 = s[k] + 0x20; } else { u2 = s[k]; } #endif if (u1 != u2) { break; } } // found it if (k == len) { switch (line->rot) { case 0: xMin1 = line->edge[j]; xMax1 = line->edge[j + len]; yMin1 = line->yMin; yMax1 = line->yMax; break; case 1: xMin1 = line->xMin; xMax1 = line->xMax; yMin1 = line->edge[j]; yMax1 = line->edge[j + len]; break; case 2: xMin1 = line->edge[j + len]; xMax1 = line->edge[j]; yMin1 = line->yMin; yMax1 = line->yMax; break; case 3: xMin1 = line->xMin; xMax1 = line->xMax; yMin1 = line->edge[j + len]; yMax1 = line->edge[j]; break; } if ((startAtTop || yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom || yMin1 < yStop || (yMin1 == yStop && xMin1 < yStop))) { if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) { xMin0 = xMin1; xMax0 = xMax1; yMin0 = yMin1; yMax0 = yMax1; found = gTrue; } } } } } } if (found) { *xMin = xMin0; *xMax = xMax0; *yMin = yMin0; *yMax = yMax0; lastFindXMin = xMin0; lastFindYMin = yMin0; haveLastFind = gTrue; return gTrue; } return gFalse; } GString *TextPage::getText(double xMin, double yMin, double xMax, double yMax) { GString *s; UnicodeMap *uMap; GBool isUnicode; TextBlock *blk; TextLine *line; TextLineFrag *frags; int nFrags, fragsSize; TextLineFrag *frag; char space[8], eol[16]; int spaceLen, eolLen; int lastRot; double x, y; int col, idx0, idx1, i, j; GBool multiLine, oneRot; s = new GString(); if (rawOrder) { return s; } // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { return s; } isUnicode = uMap->isUnicode(); spaceLen = 
uMap->mapUnicode(0x20, space, sizeof(space));
  // encode the end-of-line sequence selected in the global parameters
  eolLen = 0; // make gcc happy
  switch (globalParams->getTextEOL()) {
  case eolUnix:
    eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
    break;
  case eolDOS:
    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
    eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
    break;
  case eolMac:
    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
    break;
  }

  //~ writing mode (horiz/vert)

  // collect the line fragments that are in the rectangle
  fragsSize = 256;
  frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
  nFrags = 0;
  lastRot = -1;
  oneRot = gTrue;
  for (i = 0; i < nBlocks; ++i) {
    blk = blocks[i];
    // skip blocks (and, below, lines) whose bounding box doesn't
    // intersect the rectangle
    if (xMin < blk->xMax && blk->xMin < xMax &&
        yMin < blk->yMax && blk->yMin < yMax) {
      for (line = blk->lines; line; line = line->next) {
        if (xMin < line->xMax && line->xMin < xMax &&
            yMin < line->yMax && line->yMin < yMax) {
          // idx0..idx1 is the range of characters whose midpoints lie
          // inside the rectangle; the line is only considered if its
          // own midline (across the text direction) is inside too
          idx0 = idx1 = -1;
          switch (line->rot) {
          case 0:
            y = 0.5 * (line->yMin + line->yMax);
            if (yMin < y && y < yMax) {
              j = 0;
              while (j < line->len) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
                  idx0 = j;
                  break;
                }
                ++j;
              }
              j = line->len - 1;
              while (j >= 0) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
                  idx1 = j;
                  break;
                }
                --j;
              }
            }
            break;
          case 1:
            x = 0.5 * (line->xMin + line->xMax);
            if (xMin < x && x < xMax) {
              j = 0;
              while (j < line->len) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
                  idx0 = j;
                  break;
                }
                ++j;
              }
              j = line->len - 1;
              while (j >= 0) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
                  idx1 = j;
                  break;
                }
                --j;
              }
            }
            break;
          case 2:
            y = 0.5 * (line->yMin + line->yMax);
            if (yMin < y && y < yMax) {
              j = 0;
              while (j < line->len) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
                  idx0 = j;
                  break;
                }
                ++j;
              }
              j = line->len - 1;
              while (j >= 0) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
                  idx1 = j;
                  break;
                }
                --j;
              }
            }
            break;
          case 3:
            x = 0.5 * (line->xMin + line->xMax);
            if (xMin < x && x < xMax) {
              j = 0;
              while (j < line->len) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
                  idx0 = j;
                  break;
                }
                ++j;
              }
              j = line->len - 1;
              while (j >= 0) {
                if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
                  idx1 = j;
                  break;
                }
                --j;
              }
            }
            break;
          }
          // NOTE(review): only idx0 >= 0 and idx1 >= 0 are checked
          // here, not idx0 <= idx1 -- confirm a non-positive fragment
          // length can't reach init()
          if (idx0 >= 0 && idx1 >= 0) {
            // double the fragment array when it is full
            if (nFrags == fragsSize) {
              fragsSize *= 2;
              frags = (TextLineFrag *)
                          grealloc(frags, fragsSize * sizeof(TextLineFrag));
            }
            frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
            ++nFrags;
            // track whether every selected line has the same rotation
            if (lastRot >= 0 && line->rot != lastRot) {
              oneRot = gFalse;
            }
            lastRot = line->rot;
          }
        }
      }
    }
  }

  // sort the fragments and generate the string
  if (nFrags > 0) {

    for (i = 0; i < nFrags; ++i) {
      frags[i].computeCoords(oneRot);
    }
    assignColumns(frags, nFrags, oneRot);

    // if all lines in the region have the same rotation, use it;
    // otherwise, use the page's primary rotation
    if (oneRot) {
      qsort(frags, nFrags, sizeof(TextLineFrag),
            &TextLineFrag::cmpYXLineRot);
    } else {
      qsort(frags, nFrags, sizeof(TextLineFrag),
            &TextLineFrag::cmpYXPrimaryRot);
    }

    col = 0;
    multiLine = gFalse;
    for (i = 0; i < nFrags; ++i) {
      frag = &frags[i];

      // insert a return
      if (frag->col < col ||
          (i > 0 && fabs(frag->base - frags[i-1].base) >
                      maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
        s->append(eol, eolLen);
        col = 0;
        multiLine = gTrue;
      }

      // column alignment
      for (; col < frag->col; ++col) {
        s->append(space, spaceLen);
      }

      // get the fragment text
      col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
    }

    // a trailing end-of-line is added only if the output spans more
    // than one line
    if (multiLine) {
      s->append(eol, eolLen);
    }
  }

  gfree(frags);
  uMap->decRefCnt();

  return s;
}

// Compute the bounding box of the characters whose stream positions
// fall in [pos, pos + length).  On success the box is stored in
// *xMin, *yMin, *xMax, *yMax and gTrue is returned; gFalse is
// returned if no word overlaps the range or the page is in raw order.
GBool TextPage::findCharRange(int pos, int length,
                              double *xMin, double *yMin,
                              double *xMax, double *yMax) {
  TextBlock *blk;
  TextLine *line;
  TextWord *word;
  double xMin0, xMax0, yMin0, yMax0;
  double xMin1, xMax1, yMin1, yMax1;
  GBool first;
  int i, j0, j1;

  if (rawOrder) {
    return gFalse;
  }

  //~ this doesn't correctly handle:
  //~ - ranges split across multiple lines (the highlighted region
  //~   is the bounding box of all the parts of the range)
  //~ - cases where characters don't convert one-to-one into Unicode
  first = gTrue;
  xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
  xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
  for (i = 0; i < nBlocks; ++i) {
    blk = blocks[i];
    for (line = blk->lines; line; line = line->next) {
      for (word = line->words; word; word = word->next) {
        // does this word's character range overlap the requested one?
        if (pos < word->charPos + word->charLen &&
            word->charPos < pos + length) {
          // clamp the requested range to this word: j0 is the first
          // and j1 the last index used
          // NOTE(review): j0 is only clamped from below; presumably
          // charLen never exceeds len by more than one -- confirm
          // edge[j0] stays in bounds
          j0 = pos - word->charPos;
          if (j0 < 0) {
            j0 = 0;
          }
          j1 = pos + length - 1 - word->charPos;
          if (j1 >= word->len) {
            j1 = word->len - 1;
          }
          switch (line->rot) {
          case 0:
            xMin1 = word->edge[j0];
            xMax1 = word->edge[j1 + 1];
            yMin1 = word->yMin;
            yMax1 = word->yMax;
            break;
          case 1:
            xMin1 = word->xMin;
            xMax1 = word->xMax;
            yMin1 = word->edge[j0];
            yMax1 = word->edge[j1 + 1];
            break;
          case 2:
            xMin1 = word->edge[j1 + 1];
            xMax1 = word->edge[j0];
            yMin1 = word->yMin;
            yMax1 = word->yMax;
            break;
          case 3:
            xMin1 = word->xMin;
            xMax1 = word->xMax;
            yMin1 = word->edge[j1 + 1];
            yMax1 = word->edge[j0];
            break;
          }
          // merge this word's box into the running bounding box
          if (first || xMin1 < xMin0) {
            xMin0 = xMin1;
          }
          if (first || xMax1 > xMax0) {
            xMax0 = xMax1;
          }
          if (first || yMin1 < yMin0) {
            yMin0 = yMin1;
          }
          if (first || yMax1 > yMax0) {
            yMax0 = yMax1;
          }
          first = gFalse;
        }
      }
    }
  }
  if (!first) {
    *xMin = xMin0;
    *xMax = xMax0;
    *yMin = yMin0;
    *yMax = yMax0;
    return gTrue;
  }
  return gFalse;
}

// Write the page's text through [outputFunc], which is called with
// [outputStream], a buffer and its length.
void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
                    GBool physLayout) {
  UnicodeMap *uMap;
  TextFlow *flow;
  TextBlock *blk;
  TextLine *line;
  TextLineFrag *frags;
  TextWord *word;
  int nFrags, fragsSize;
  TextLineFrag *frag;
  char space[8], eol[16], eop[8];
  int spaceLen, eolLen, eopLen;
  GBool pageBreaks;
  GString *s;
  int col, i, d, n;

  // get the output encoding
  if (!(uMap = globalParams->getTextEncoding())) {
    return;
  }
  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
  eolLen = 0; // make gcc happy
  switch (globalParams->getTextEOL()) {
  case eolUnix:
    eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
    break;
  case eolDOS:
    eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
    eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
    break;
case eolMac: eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol)); break; } eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); pageBreaks = globalParams->getTextPageBreaks(); //~ writing mode (horiz/vert) // output the page in raw (content stream) order if (rawOrder) { for (word = rawWords; word; word = word->next) { s = new GString(); dumpFragment(word->text, word->len, uMap, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; if (word->next && fabs(word->next->base - word->base) < maxIntraLineDelta * word->fontSize) { if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) { (*outputFunc)(outputStream, space, spaceLen); } } else { (*outputFunc)(outputStream, eol, eolLen); } } // output the page, maintaining the original physical layout } else if (physLayout) { // collect the line fragments for the page and sort them fragsSize = 256; frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag)); nFrags = 0; for (i = 0; i < nBlocks; ++i) { blk = blocks[i]; for (line = blk->lines; line; line = line->next) { if (nFrags == fragsSize) { fragsSize *= 2; frags = (TextLineFrag *)grealloc(frags, fragsSize * sizeof(TextLineFrag)); } frags[nFrags].init(line, 0, line->len); frags[nFrags].computeCoords(gTrue); ++nFrags; } } qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot); // generate output col = 0; for (i = 0; i < nFrags; ++i) { frag = &frags[i]; // column alignment for (; col < frag->col; ++col) { (*outputFunc)(outputStream, space, spaceLen); } // print the line s = new GString(); col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; // print one or more returns if necessary if (i == nFrags - 1 || frags[i+1].col < col || fabs(frags[i+1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) { if (i < nFrags - 1) { d = (int)((frags[i+1].base - frag->base) / frag->line->words->fontSize); if (d < 1) { d = 1; } 
else if (d > 5) { d = 5; } } else { d = 1; } for (; d > 0; --d) { (*outputFunc)(outputStream, eol, eolLen); } col = 0; } } gfree(frags); // output the page, "undoing" the layout } else { for (flow = flows; flow; flow = flow->next) { for (blk = flow->blocks; blk; blk = blk->next) { for (line = blk->lines; line; line = line->next) { n = line->len; if (line->hyphenated && (line->next || blk->next)) { --n; } s = new GString(); dumpFragment(line->text, n, uMap, s); (*outputFunc)(outputStream, s->getCString(), s->getLength()); delete s; if (!line->hyphenated) { if (line->next) { (*outputFunc)(outputStream, space, spaceLen); } else if (blk->next) { //~ this is a bit of a kludge - we should really do a more //~ intelligent determination of paragraphs if (blk->next->lines->words->fontSize == blk->lines->words->fontSize) { (*outputFunc)(outputStream, space, spaceLen); } else { (*outputFunc)(outputStream, eol, eolLen); } } } } } (*outputFunc)(outputStream, eol, eolLen); (*outputFunc)(outputStream, eol, eolLen); } } // end of page if (pageBreaks) { (*outputFunc)(outputStream, eop, eopLen); (*outputFunc)(outputStream, eol, eolLen); } uMap->decRefCnt(); } void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) { TextLineFrag *frag0, *frag1; int rot, col1, col2, i, j, k; // all text in the region has the same rotation -- recompute the // column numbers based only on the text in the region if (oneRot) { qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot); rot = frags[0].line->rot; for (i = 0; i < nFrags; ++i) { frag0 = &frags[i]; col1 = 0; for (j = 0; j < i; ++j) { frag1 = &frags[j]; col2 = 0; // make gcc happy switch (rot) { case 0: if (frag0->xMin >= frag1->xMax) { col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; } else { for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k+1]); ++k) ; col2 = frag1->col + 
frag1->line->col[k] - frag1->line->col[frag1->start]; } break; case 1: if (frag0->yMin >= frag1->yMax) { col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; } else { for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k+1]); ++k) ; col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; } break; case 2: if (frag0->xMax <= frag1->xMin) { col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; } else { for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k+1]); ++k) ; col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; } break; case 3: if (frag0->yMax <= frag1->yMin) { col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1; } else { for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k+1]); ++k) ; col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start]; } break; } if (col2 > col1) { col1 = col2; } } frag0->col = col1; } // the region includes text at different rotations -- use the // globally assigned column numbers, offset by the minimum column // number (i.e., shift everything over to column 0) } else { col1 = frags[0].col; for (i = 1; i < nFrags; ++i) { if (frags[i].col < col1) { col1 = frags[i].col; } } for (i = 0; i < nFrags; ++i) { frags[i].col -= col1; } } } int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s) { char lre[8], rle[8], popdf[8], buf[8]; int lreLen, rleLen, popdfLen, n; int nCols, i, j, k; nCols = 0; if (uMap->isUnicode()) { lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle)); popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf)); if (primaryLR) { i = 0; while (i < 
len) { // output a left-to-right section for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ; for (k = i; k < j; ++k) { n = uMap->mapUnicode(text[k], buf, sizeof(buf)); s->append(buf, n); ++nCols; } i = j; // output a right-to-left section for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ; if (j > i) { s->append(rle, rleLen); for (k = j - 1; k >= i; --k) { n = uMap->mapUnicode(text[k], buf, sizeof(buf)); s->append(buf, n); ++nCols; } s->append(popdf, popdfLen); i = j; } } } else { s->append(rle, rleLen); i = len - 1; while (i >= 0) { // output a right-to-left section for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ; for (k = i; k > j; --k) { n = uMap->mapUnicode(text[k], buf, sizeof(buf)); s->append(buf, n); ++nCols; } i = j; // output a left-to-right section for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ; if (j < i) { s->append(lre, lreLen); for (k = j + 1; k <= i; ++k) { n = uMap->mapUnicode(text[k], buf, sizeof(buf)); s->append(buf, n); ++nCols; } s->append(popdf, popdfLen); i = j; } } s->append(popdf, popdfLen); } } else { for (i = 0; i < len; ++i) { n = uMap->mapUnicode(text[i], buf, sizeof(buf)); s->append(buf, n); nCols += n; } } return nCols; } #if TEXTOUT_WORD_LIST TextWordList *TextPage::makeWordList(GBool physLayout) { return new TextWordList(this, physLayout); } #endif //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ static void outputToFile(void *stream, char *text, int len) { fwrite(text, 1, len, (FILE *)stream); } TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, GBool rawOrderA, GBool append) { text = NULL; physLayout = physLayoutA; rawOrder = rawOrderA; ok = gTrue; // open file needClose = gFalse; if (fileName) { if (!strcmp(fileName, "-")) { outputStream = stdout; #ifdef WIN32 // keep DOS from munging the end-of-line characters setmode(fileno(stdout), O_BINARY); #endif } else if ((outputStream = 
fopen(fileName, append ? "ab" : "wb"))) { needClose = gTrue; } else { error(-1, "Couldn't open text file '%s'", fileName); ok = gFalse; return; } outputFunc = &outputToFile; } else { outputStream = NULL; } // set up text object text = new TextPage(rawOrderA); } TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, GBool physLayoutA, GBool rawOrderA) { outputFunc = func; outputStream = stream; needClose = gFalse; physLayout = physLayoutA; rawOrder = rawOrderA; text = new TextPage(rawOrderA); ok = gTrue; } TextOutputDev::~TextOutputDev() { if (needClose) { #ifdef MACOS ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle); #endif fclose((FILE *)outputStream); } if (text) { delete text; } } void TextOutputDev::startPage(int pageNum, GfxState *state) { text->startPage(state); } void TextOutputDev::endPage() { text->coalesce(physLayout); if (outputStream) { text->dump(outputStream, outputFunc, physLayout); } } void TextOutputDev::updateFont(GfxState *state) { text->updateFont(state); } void TextOutputDev::beginString(GfxState *state, GString *s) { text->beginWord(state, state->getCurX(), state->getCurY()); } void TextOutputDev::endString(GfxState *state) { text->endWord(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, Unicode *u, int uLen) { text->addChar(state, x, y, dx, dy, c, u, uLen); } GBool TextOutputDev::findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax) { return text->findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, xMin, yMin, xMax, yMax); } GString *TextOutputDev::getText(double xMin, double yMin, double xMax, double yMax) { return text->getText(xMin, yMin, xMax, yMax); } GBool TextOutputDev::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) { return text->findCharRange(pos, length, xMin, 
yMin, xMax, yMax); } #if TEXTOUT_WORD_LIST TextWordList *TextOutputDev::makeWordList() { return text->makeWordList(physLayout); } #endif