Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/pdf/xpdf/TextOutputDev.cc
diff options
context:
space:
mode:
authorMartin Kretzschmar <mkretzschmar@src.gnome.org>2004-05-16 22:45:42 (GMT)
committer Martin Kretzschmar <mkretzschmar@src.gnome.org>2004-05-16 22:45:42 (GMT)
commitad63666daeeda50acc7630132d61fe044634fddd (patch)
tree487b52b30f8a563cbb2e16c6fd2527a6eeb5990f /pdf/xpdf/TextOutputDev.cc
parentd57c02ebc09bfd1a0cac44140ec7a80dbe43877e (diff)
Imported Xpdf 2.03 and fixed build.
* ANNOUNCE: * CHANGES: * README: * aconf2.h: * configure.in: * dj_make.bat: * doc/pdffonts.1: * doc/pdffonts.cat: * doc/pdffonts.hlp: * doc/pdfimages.1: * doc/pdfimages.cat: * doc/pdfimages.hlp: * doc/pdfinfo.1: * doc/pdfinfo.cat: * doc/pdfinfo.hlp: * doc/pdftopbm.1: * doc/pdftopbm.cat: * doc/pdftopbm.hlp: * doc/pdftops.1: * doc/pdftops.cat: * doc/pdftops.hlp: * doc/pdftotext.1: * doc/pdftotext.cat: * doc/pdftotext.hlp: * doc/xpdf.1: * doc/xpdf.cat: * doc/xpdf.hlp: * doc/xpdfrc.5: * doc/xpdfrc.cat: * doc/xpdfrc.hlp: * goo/gfile.cc: * ms_make.bat: * vms_make.com: * xpdf/Annot.cc: * xpdf/Array.cc: * xpdf/BuiltinFontTables.cc: * xpdf/CMap.cc: * xpdf/CMap.h: * xpdf/Catalog.cc: * xpdf/CharCodeToUnicode.cc: * xpdf/CharCodeToUnicode.h: * xpdf/Decrypt.cc: * xpdf/Dict.cc: * xpdf/ErrorCodes.h: * xpdf/FTFont.cc: * xpdf/FTFont.h: * xpdf/FontFile.cc: * xpdf/FontFile.h: * xpdf/Function.cc: * xpdf/Gfx.cc: * xpdf/Gfx.h: * xpdf/GfxFont.cc: * xpdf/GfxFont.h: * xpdf/GfxState.cc: * xpdf/GfxState.h: * xpdf/GlobalParams.cc: * xpdf/GlobalParams.h: * xpdf/JBIG2Stream.cc: * xpdf/Link.cc: * xpdf/Link.h: * xpdf/Makefile.am: * xpdf/OutputDev.h: * xpdf/PDFDoc.cc: * xpdf/PDFDoc.h: * xpdf/PSOutputDev.cc: * xpdf/PSOutputDev.h: * xpdf/Page.cc: * xpdf/Page.h: * xpdf/Parser.cc: * xpdf/Stream.cc: * xpdf/Stream.h: * xpdf/TTFont.cc: * xpdf/TTFont.h: * xpdf/TextOutputDev.cc: * xpdf/TextOutputDev.h: * xpdf/UnicodeMap.cc: * xpdf/UnicodeMap.h: * xpdf/UnicodeTypeTable.cc: * xpdf/UnicodeTypeTable.h: * xpdf/XOutputDev.cc: * xpdf/XOutputDev.h: * xpdf/XPDFApp.cc: * xpdf/XPDFCore.cc: * xpdf/XPDFCore.h: * xpdf/XPDFViewer.cc: * xpdf/XPDFViewer.h: * xpdf/XRef.cc: * xpdf/about-text.h: * xpdf/config.h: * xpdf/gpdf-control.cc: * xpdf/gpdf-link-canvas-item.cc: * xpdf/gpdf-links-canvas-layer.cc: * xpdf/pdffonts.cc: * xpdf/pdfimages.cc: * xpdf/pdfinfo.cc: * xpdf/pdftopbm.cc: * xpdf/pdftops.cc: * xpdf/pdftotext.cc: * xpdf/tests/test-links.cc: * xpdf/vms_make.com: * xpdf/xpdf.cc: Imported Xpdf 2.03 and fixed build.
Diffstat (limited to 'pdf/xpdf/TextOutputDev.cc')
-rw-r--r--pdf/xpdf/TextOutputDev.cc4136
1 files changed, 2725 insertions, 1411 deletions
diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc
index 4e9a63a..aeee59c 100644
--- a/pdf/xpdf/TextOutputDev.cc
+++ b/pdf/xpdf/TextOutputDev.cc
@@ -28,6 +28,7 @@
#include "Error.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
+#include "UnicodeTypeTable.h"
#include "GfxState.h"
#include "TextOutputDev.h"
@@ -40,176 +41,187 @@
// parameters
//------------------------------------------------------------------------
-// Minium and maximum inter-word spacing (as a fraction of the average
-// character width).
-#define wordMinSpaceWidth 0.3
-#define wordMaxSpaceWidth 2.0
-
-// Default min and max inter-word spacing (when the average character
-// width is unknown).
-#define wordDefMinSpaceWidth 0.2
-#define wordDefMaxSpaceWidth 1.5
-
-// Max difference in x,y coordinates (as a fraction of the font size)
-// allowed for duplicated text (fake boldface, drop shadows) which is
-// to be discarded.
-#define dupMaxDeltaX 0.1
-#define dupMaxDeltaY 0.2
-
-// Min overlap (as a fraction of the font size) required for two
-// lines to be considered vertically overlapping.
-#define lineOverlapSlack 0.5
-
-// Max difference in baseline y coordinates (as a fraction of the font
-// size) allowed for words which are to be grouped into a line, not
-// including sub/superscripts.
-#define lineMaxBaselineDelta 0.1
-
-// Max ratio of font sizes allowed for words which are to be grouped
-// into a line, not including sub/superscripts.
-#define lineMaxFontSizeRatio 1.4
-
-// Min spacing (as a fraction of the font size) allowed between words
-// which are to be grouped into a line.
-#define lineMinDeltaX -0.5
-
-// Minimum vertical overlap (as a fraction of the font size) required
-// for superscript and subscript words.
-#define lineMinSuperscriptOverlap 0.3
-#define lineMinSubscriptOverlap 0.3
-
-// Min/max ratio of font sizes allowed for sub/superscripts compared to
-// the base text.
-#define lineMinSubscriptFontSizeRatio 0.4
-#define lineMaxSubscriptFontSizeRatio 1.01
-#define lineMinSuperscriptFontSizeRatio 0.4
-#define lineMaxSuperscriptFontSizeRatio 1.01
-
-// Max horizontal spacing (as a fraction of the font size) allowed
-// before sub/superscripts.
-#define lineMaxSubscriptDeltaX 0.2
-#define lineMaxSuperscriptDeltaX 0.2
-
-// Maximum vertical spacing (as a fraction of the font size) allowed
-// for lines which are to be grouped into a block.
-#define blkMaxSpacing 2.0
-
-// Max ratio of primary font sizes allowed for lines which are to be
-// grouped into a block.
-#define blkMaxFontSizeRatio 1.3
-
-// Min overlap (as a fraction of the font size) required for two
-// blocks to be considered vertically overlapping.
-#define blkOverlapSlack 0.5
-
-// Max vertical spacing (as a fraction of the font size) allowed
-// between blocks which are 'adjacent' when sorted by reading order.
-#define blkMaxSortSpacing 2.0
-
-// Max vertical offset (as a fraction of the font size) of the top and
-// bottom edges allowed for blocks which are to be grouped into a
-// flow.
-#define flowMaxDeltaY 1.0
+// Each bucket in a text pool includes baselines within a range of
+// this many points.
+#define textPoolStep 4
+
+// Inter-character space width which will cause addChar to break up a
+// text string.
+#define defaultSpaceWidth 0.25
+
+// Max distance between baselines of two lines within a block, as a
+// fraction of the font size.
+#define maxLineSpacingDelta 1.5
+
+// Max difference in primary font sizes on two lines in the same
+// block. Delta1 is used when examining new lines above and below the
+// current block; delta2 is used when examining text that overlaps the
+// current block; delta3 is used when examining text to the left and
+// right of the current block.
+#define maxBlockFontSizeDelta1 0.05
+#define maxBlockFontSizeDelta2 0.6
+#define maxBlockFontSizeDelta3 0.2
+
+// Max difference in font sizes inside a word.
+#define maxWordFontSizeDelta 0.05
+
+// Maximum distance between baselines of two words on the same line,
+// e.g., distance between subscript or superscript and the primary
+// baseline, as a fraction of the font size.
+#define maxIntraLineDelta 0.5
+
+// Minimum inter-word spacing, as a fraction of the font size. (Only
+// used for raw ordering.)
+#define minWordSpacing 0.2
+
+// Maximum inter-word spacing, as a fraction of the font size.
+#define maxWordSpacing 1.5
+
+// Minimum spacing between columns, as a fraction of the font size.
+#define minColSpacing 1.0
+
+// Maximum vertical spacing between blocks within a flow, as a
+// multiple of the font size.
+#define maxBlockSpacing 2.5
+
+// Minimum spacing between characters within a word, as a fraction of
+// the font size.
+#define minCharSpacing -0.2
+
+// Maximum spacing between characters within a word, as a fraction of
+// the font size, when there is no obvious extra-wide character
+// spacing.
+#define maxCharSpacing 0.03
+
+// When extra-wide character spacing is detected, the inter-character
+// space threshold is set to the minimum inter-character space
+// multiplied by this constant.
+#define maxWideCharSpacingMul 1.3
+
+// Max difference in primary,secondary coordinates (as a fraction of
+// the font size) allowed for duplicated text (fake boldface, drop
+// shadows) which is to be discarded.
+#define dupMaxPriDelta 0.1
+#define dupMaxSecDelta 0.2
//------------------------------------------------------------------------
// TextFontInfo
//------------------------------------------------------------------------
TextFontInfo::TextFontInfo(GfxState *state) {
- double *textMat;
- double t1, t2, avgWidth, w;
- int n, i;
-
gfxFont = state->getFont();
- textMat = state->getTextMat();
- horizScaling = state->getHorizScaling();
- if ((t1 = fabs(textMat[0])) > 0.01 &&
- (t2 = fabs(textMat[3])) > 0.01) {
- horizScaling *= t1 / t2;
- }
-
- minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
- maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
- if (gfxFont && gfxFont->isCIDFont()) {
- //~ handle 16-bit fonts
- } else if (gfxFont && gfxFont->getType() != fontType3) {
- avgWidth = 0;
- n = 0;
- for (i = 0; i < 256; ++i) {
- w = ((Gfx8BitFont *)gfxFont)->getWidth(i);
- if (w > 0) {
- avgWidth += w;
- ++n;
- }
- }
- if (n > 0) {
- avgWidth /= n;
- minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth;
- maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth;
- }
- }
-
+#if TEXTOUT_WORD_LIST
+ fontName = (gfxFont && gfxFont->getOrigName())
+ ? gfxFont->getOrigName()->copy()
+ : (GString *)NULL;
+#endif
}
TextFontInfo::~TextFontInfo() {
+#if TEXTOUT_WORD_LIST
+ if (fontName) {
+ delete fontName;
+ }
+#endif
}
GBool TextFontInfo::matches(GfxState *state) {
- double *textMat;
- double t1, t2, h;
-
- textMat = state->getTextMat();
- h = state->getHorizScaling();
- if ((t1 = fabs(textMat[0])) > 0.01 &&
- (t2 = fabs(textMat[3])) > 0.01) {
- h *= t1 / t2;
- }
- return state->getFont() == gfxFont &&
- fabs(h - horizScaling) < 0.01;
+ return state->getFont() == gfxFont;
}
//------------------------------------------------------------------------
// TextWord
//------------------------------------------------------------------------
-TextWord::TextWord(GfxState *state, double x0, double y0, int charPosA,
- TextFontInfo *fontA, double fontSizeA) {
+TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
+ int charPosA, TextFontInfo *fontA, double fontSizeA) {
GfxFont *gfxFont;
- double x, y;
+ double x, y, ascent, descent;
+ rot = rotA;
charPos = charPosA;
charLen = 0;
font = fontA;
fontSize = fontSizeA;
state->transform(x0, y0, &x, &y);
if ((gfxFont = font->gfxFont)) {
- yMin = y - gfxFont->getAscent() * fontSize;
- yMax = y - gfxFont->getDescent() * fontSize;
+ ascent = gfxFont->getAscent() * fontSize;
+ descent = gfxFont->getDescent() * fontSize;
} else {
// this means that the PDF file draws text without a current font,
// which should never happen
- yMin = y - 0.95 * fontSize;
- yMax = y + 0.35 * fontSize;
- }
- if (yMin == yMax) {
- // this is a sanity check for a case that shouldn't happen -- but
- // if it does happen, we want to avoid dividing by zero later
- yMin = y;
- yMax = y + 1;
+ ascent = 0.95 * fontSize;
+ descent = -0.35 * fontSize;
+ }
+ switch (rot) {
+ case 0:
+ yMin = y - ascent;
+ yMax = y - descent;
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ base = y;
+ break;
+ case 1:
+ xMin = x + descent;
+ xMax = x + ascent;
+ if (xMin == xMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ xMin = x;
+ xMax = x + 1;
+ }
+ base = x;
+ break;
+ case 2:
+ yMin = y + descent;
+ yMax = y + ascent;
+ if (yMin == yMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ yMin = y;
+ yMax = y + 1;
+ }
+ base = y;
+ break;
+ case 3:
+ xMin = x - ascent;
+ xMax = x - descent;
+ if (xMin == xMax) {
+ // this is a sanity check for a case that shouldn't happen -- but
+ // if it does happen, we want to avoid dividing by zero later
+ xMin = x;
+ xMax = x + 1;
+ }
+ base = x;
+ break;
}
- yBase = y;
text = NULL;
- xRight = NULL;
+ edge = NULL;
len = size = 0;
spaceAfter = gFalse;
next = NULL;
-}
+#if TEXTOUT_WORD_LIST
+ GfxRGB rgb;
+ if ((state->getRender() & 3) == 1) {
+ state->getStrokeRGB(&rgb);
+ } else {
+ state->getFillRGB(&rgb);
+ }
+ colorR = rgb.r;
+ colorG = rgb.g;
+ colorB = rgb.b;
+#endif
+}
TextWord::~TextWord() {
gfree(text);
- gfree(xRight);
+ gfree(edge);
}
void TextWord::addChar(GfxState *state, double x, double y,
@@ -217,234 +229,1450 @@ void TextWord::addChar(GfxState *state, double x, double y,
if (len == size) {
size += 16;
text = (Unicode *)grealloc(text, size * sizeof(Unicode));
- xRight = (double *)grealloc(xRight, size * sizeof(double));
+ edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
}
text[len] = u;
- if (len == 0) {
- xMin = x;
+ switch (rot) {
+ case 0:
+ if (len == 0) {
+ xMin = x;
+ }
+ edge[len] = x;
+ xMax = edge[len+1] = x + dx;
+ break;
+ case 1:
+ if (len == 0) {
+ yMin = y;
+ }
+ edge[len] = y;
+ yMax = edge[len+1] = y + dy;
+ break;
+ case 2:
+ if (len == 0) {
+ xMax = x;
+ }
+ edge[len] = x;
+ xMin = edge[len+1] = x + dx;
+ break;
+ case 3:
+ if (len == 0) {
+ yMax = y;
+ }
+ edge[len] = y;
+ yMin = edge[len+1] = y + dy;
+ break;
}
- xMax = xRight[len] = x + dx;
++len;
}
-// Returns true if <this> comes before <word2> in xy order.
-GBool TextWord::xyBefore(TextWord *word2) {
- return xMin < word2->xMin ||
- (xMin == word2->xMin && yMin < word2->yMin);
-}
-
-// Merge another word onto the end of this one.
-void TextWord::merge(TextWord *word2) {
+void TextWord::merge(TextWord *word) {
int i;
- xMax = word2->xMax;
- if (word2->yMin < yMin) {
- yMin = word2->yMin;
+ if (word->xMin < xMin) {
+ xMin = word->xMin;
+ }
+ if (word->yMin < yMin) {
+ yMin = word->yMin;
}
- if (word2->yMax > yMax) {
- yMax = word2->yMax;
+ if (word->xMax > xMax) {
+ xMax = word->xMax;
}
- if (len + word2->len > size) {
- size = len + word2->len;
+ if (word->yMax > yMax) {
+ yMax = word->yMax;
+ }
+ if (len + word->len > size) {
+ size = len + word->len;
text = (Unicode *)grealloc(text, size * sizeof(Unicode));
- xRight = (double *)grealloc(xRight, size * sizeof(double));
+ edge = (double *)grealloc(edge, (size + 1) * sizeof(double));
+ }
+ for (i = 0; i < word->len; ++i) {
+ text[len + i] = word->text[i];
+ edge[len + i] = word->edge[i];
+ }
+ edge[len + word->len] = word->edge[word->len];
+ len += word->len;
+ charLen += word->charLen;
+}
+
+inline int TextWord::primaryCmp(TextWord *word) {
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ cmp = xMin - word->xMin;
+ break;
+ case 1:
+ cmp = yMin - word->yMin;
+ break;
+ case 2:
+ cmp = word->xMax - xMax;
+ break;
+ case 3:
+ cmp = word->yMax - yMax;
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+double TextWord::primaryDelta(TextWord *word) {
+ double delta;
+
+ delta = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ delta = word->xMin - xMax;
+ break;
+ case 1:
+ delta = word->yMin - yMax;
+ break;
+ case 2:
+ delta = xMin - word->xMax;
+ break;
+ case 3:
+ delta = yMin - word->yMax;
+ break;
+ }
+ return delta;
+}
+
+int TextWord::cmpYX(const void *p1, const void *p2) {
+ TextWord *word1 = *(TextWord **)p1;
+ TextWord *word2 = *(TextWord **)p2;
+ double cmp;
+
+ cmp = word1->yMin - word2->yMin;
+ if (cmp == 0) {
+ cmp = word1->xMin - word2->xMin;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+#if TEXTOUT_WORD_LIST
+
+GString *TextWord::getText() {
+ GString *s;
+ UnicodeMap *uMap;
+ char buf[8];
+ int n, i;
+
+ s = new GString();
+ if (!(uMap = globalParams->getTextEncoding())) {
+ return s;
}
- for (i = 0; i < word2->len; ++i) {
- text[len + i] = word2->text[i];
- xRight[len + i] = word2->xRight[i];
+ for (i = 0; i < len; ++i) {
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
+ s->append(buf, n);
}
- len += word2->len;
- charLen += word2->charLen;
+ uMap->decRefCnt();
+ return s;
+}
+
+#endif // TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextPool
+//------------------------------------------------------------------------
+
+TextPool::TextPool() {
+ minBaseIdx = 0;
+ maxBaseIdx = -1;
+ pool = NULL;
+ cursor = NULL;
+ cursorBaseIdx = -1;
+}
+
+TextPool::~TextPool() {
+ int baseIdx;
+ TextWord *word, *word2;
+
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
+ word2 = word->next;
+ delete word;
+ }
+ }
+ gfree(pool);
+}
+
+int TextPool::getBaseIdx(double base) {
+ int baseIdx;
+
+ baseIdx = (int)(base / textPoolStep);
+ if (baseIdx < minBaseIdx) {
+ return minBaseIdx;
+ }
+ if (baseIdx > maxBaseIdx) {
+ return maxBaseIdx;
+ }
+ return baseIdx;
+}
+
+void TextPool::addWord(TextWord *word) {
+ TextWord **newPool;
+ int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
+ TextWord *w0, *w1;
+
+ // expand the array if needed
+ wordBaseIdx = (int)(word->base / textPoolStep);
+ if (minBaseIdx > maxBaseIdx) {
+ minBaseIdx = wordBaseIdx - 128;
+ maxBaseIdx = wordBaseIdx + 128;
+ pool = (TextWord **)gmalloc((maxBaseIdx - minBaseIdx + 1) *
+ sizeof(TextWord *));
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ pool[baseIdx - minBaseIdx] = NULL;
+ }
+ } else if (wordBaseIdx < minBaseIdx) {
+ newMinBaseIdx = wordBaseIdx - 128;
+ newPool = (TextWord **)gmalloc((maxBaseIdx - newMinBaseIdx + 1) *
+ sizeof(TextWord *));
+ for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
+ newPool[baseIdx - newMinBaseIdx] = NULL;
+ }
+ memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
+ (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
+ gfree(pool);
+ pool = newPool;
+ minBaseIdx = newMinBaseIdx;
+ } else if (wordBaseIdx > maxBaseIdx) {
+ newMaxBaseIdx = wordBaseIdx + 128;
+ pool = (TextWord **)grealloc(pool, (newMaxBaseIdx - minBaseIdx + 1) *
+ sizeof(TextWord *));
+ for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
+ pool[baseIdx - minBaseIdx] = NULL;
+ }
+ maxBaseIdx = newMaxBaseIdx;
+ }
+
+ // insert the new word
+ if (cursor && wordBaseIdx == cursorBaseIdx &&
+ word->primaryCmp(cursor) > 0) {
+ w0 = cursor;
+ w1 = cursor->next;
+ } else {
+ w0 = NULL;
+ w1 = pool[wordBaseIdx - minBaseIdx];
+ }
+ for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
+ word->next = w1;
+ if (w0) {
+ w0->next = word;
+ } else {
+ pool[wordBaseIdx - minBaseIdx] = word;
+ }
+ cursor = word;
+ cursorBaseIdx = wordBaseIdx;
}
//------------------------------------------------------------------------
// TextLine
//------------------------------------------------------------------------
-TextLine::TextLine() {
- words = NULL;
+TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
+ blk = blkA;
+ rot = rotA;
+ xMin = yMin = 0;
+ xMax = yMax = -1;
+ base = baseA;
+ words = lastWord = NULL;
text = NULL;
- xRight = NULL;
+ edge = NULL;
col = NULL;
len = 0;
+ convertedLen = 0;
hyphenated = gFalse;
- pageNext = NULL;
next = NULL;
- flowNext = NULL;
}
TextLine::~TextLine() {
- TextWord *w1, *w2;
+ TextWord *word;
- for (w1 = words; w1; w1 = w2) {
- w2 = w1->next;
- delete w1;
+ while (words) {
+ word = words;
+ words = words->next;
+ delete word;
}
gfree(text);
- gfree(xRight);
+ gfree(edge);
gfree(col);
}
-// Returns true if <this> comes before <line2> in yx order, allowing
-// slack for vertically overlapping lines.
-GBool TextLine::yxBefore(TextLine *line2) {
- double dy;
+void TextLine::addWord(TextWord *word) {
+ if (lastWord) {
+ lastWord->next = word;
+ } else {
+ words = word;
+ }
+ lastWord = word;
+
+ if (xMin > xMax) {
+ xMin = word->xMin;
+ xMax = word->xMax;
+ yMin = word->yMin;
+ yMax = word->yMax;
+ } else {
+ if (word->xMin < xMin) {
+ xMin = word->xMin;
+ }
+ if (word->xMax > xMax) {
+ xMax = word->xMax;
+ }
+ if (word->yMin < yMin) {
+ yMin = word->yMin;
+ }
+ if (word->yMax > yMax) {
+ yMax = word->yMax;
+ }
+ }
+}
+
+double TextLine::primaryDelta(TextLine *line) {
+ double delta;
+
+ delta = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ delta = line->xMin - xMax;
+ break;
+ case 1:
+ delta = line->yMin - yMax;
+ break;
+ case 2:
+ delta = xMin - line->xMax;
+ break;
+ case 3:
+ delta = yMin - line->yMax;
+ break;
+ }
+ return delta;
+}
+
+int TextLine::primaryCmp(TextLine *line) {
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ cmp = xMin - line->xMin;
+ break;
+ case 1:
+ cmp = yMin - line->yMin;
+ break;
+ case 2:
+ cmp = line->xMax - xMax;
+ break;
+ case 3:
+ cmp = line->yMax - yMax;
+ break;
+ }
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLine::secondaryCmp(TextLine *line) {
+ double cmp;
+
+ cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
- dy = lineOverlapSlack * fontSize;
+int TextLine::cmpYX(TextLine *line) {
+ int cmp;
- // non-overlapping case
- if (line2->yMin > yMax - dy ||
- line2->yMax < yMin + dy) {
- return yMin < line2->yMin ||
- (yMin == line2->yMin && xMin < line2->xMin);
+ if ((cmp = secondaryCmp(line))) {
+ return cmp;
}
+ return primaryCmp(line);
+}
+
+int TextLine::cmpXY(const void *p1, const void *p2) {
+ TextLine *line1 = *(TextLine **)p1;
+ TextLine *line2 = *(TextLine **)p2;
+ int cmp;
+
+ if ((cmp = line1->primaryCmp(line2))) {
+ return cmp;
+ }
+ return line1->secondaryCmp(line2);
+}
+
+void TextLine::coalesce(UnicodeMap *uMap) {
+ TextWord *word0, *word1;
+ double space, delta, minSpace;
+ GBool isUnicode;
+ char buf[8];
+ int i, j;
+
+ if (words->next) {
+
+ // compute the inter-word space threshold
+ if (words->len > 1 || words->next->len > 1) {
+ minSpace = 0;
+ } else {
+ minSpace = words->primaryDelta(words->next);
+ for (word0 = words->next, word1 = word0->next;
+ word1 && minSpace > 0;
+ word0 = word1, word1 = word0->next) {
+ if (word1->len > 1) {
+ minSpace = 0;
+ }
+ delta = word0->primaryDelta(word1);
+ if (delta < minSpace) {
+ minSpace = delta;
+ }
+ }
+ }
+ if (minSpace <= 0) {
+ space = maxCharSpacing * words->fontSize;
+ } else {
+ space = maxWideCharSpacingMul * minSpace;
+ }
+
+ // merge words
+ word0 = words;
+ word1 = words->next;
+ while (word1) {
+ if (word0->primaryDelta(word1) >= space) {
+ word0->spaceAfter = gTrue;
+ word0 = word1;
+ word1 = word1->next;
+ } else if (word0->font == word1->font &&
+ fabs(word0->fontSize - word1->fontSize) <
+ maxWordFontSizeDelta * words->fontSize &&
+ word1->charPos == word0->charPos + word0->charLen) {
+ word0->merge(word1);
+ word0->next = word1->next;
+ delete word1;
+ word1 = word0->next;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
+
+ // build the line text
+ isUnicode = uMap ? uMap->isUnicode() : gFalse;
+ len = 0;
+ for (word1 = words; word1; word1 = word1->next) {
+ len += word1->len;
+ if (word1->spaceAfter) {
+ ++len;
+ }
+ }
+ text = (Unicode *)gmalloc(len * sizeof(Unicode));
+ edge = (double *)gmalloc((len + 1) * sizeof(double));
+ i = 0;
+ for (word1 = words; word1; word1 = word1->next) {
+ for (j = 0; j < word1->len; ++j) {
+ text[i] = word1->text[j];
+ edge[i] = word1->edge[j];
+ ++i;
+ }
+ edge[i] = word1->edge[word1->len];
+ if (word1->spaceAfter) {
+ text[i] = (Unicode)0x0020;
+ ++i;
+ }
+ }
+
+ // compute convertedLen and set up the col array
+ col = (int *)gmalloc((len + 1) * sizeof(int));
+ convertedLen = 0;
+ for (i = 0; i < len; ++i) {
+ col[i] = convertedLen;
+ if (isUnicode) {
+ ++convertedLen;
+ } else if (uMap) {
+ convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
+ }
+ }
+ col[len] = convertedLen;
+
+ // check for hyphen at end of line
+ //~ need to check for other chars used as hyphens
+ hyphenated = text[len - 1] == (Unicode)'-';
+}
+
+//------------------------------------------------------------------------
+// TextLineFrag
+//------------------------------------------------------------------------
+
+class TextLineFrag {
+public:
+
+ TextLine *line; // the line object
+ int start, len; // offset and length of this fragment
+ // (in Unicode chars)
+ double xMin, xMax; // bounding box coordinates
+ double yMin, yMax;
+ double base; // baseline virtual coordinate
+ int col; // first column
+
+ void init(TextLine *lineA, int startA, int lenA);
+ void computeCoords(GBool oneRot);
+
+ static int cmpYXPrimaryRot(const void *p1, const void *p2);
+ static int cmpYXLineRot(const void *p1, const void *p2);
+ static int cmpXYLineRot(const void *p1, const void *p2);
+};
+
+void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
+ line = lineA;
+ start = startA;
+ len = lenA;
+ col = line->col[start];
+}
+
+void TextLineFrag::computeCoords(GBool oneRot) {
+ TextBlock *blk;
+ double d0, d1, d2, d3, d4;
+
+ if (oneRot) {
+
+ switch (line->rot) {
+ case 0:
+ xMin = line->edge[start];
+ xMax = line->edge[start + len];
+ yMin = line->yMin;
+ yMax = line->yMax;
+ break;
+ case 1:
+ xMin = line->xMin;
+ xMax = line->xMax;
+ yMin = line->edge[start];
+ yMax = line->edge[start + len];
+ break;
+ case 2:
+ xMin = line->edge[start + len];
+ xMax = line->edge[start];
+ yMin = line->yMin;
+ yMax = line->yMax;
+ break;
+ case 3:
+ xMin = line->xMin;
+ xMax = line->xMax;
+ yMin = line->edge[start + len];
+ yMax = line->edge[start];
+ break;
+ }
+ base = line->base;
+
+ } else {
+
+ if (line->rot == 0 && line->blk->page->primaryRot == 0) {
+
+ xMin = line->edge[start];
+ xMax = line->edge[start + len];
+ yMin = line->yMin;
+ yMax = line->yMax;
+ base = line->base;
+
+ } else {
+
+ blk = line->blk;
+ d0 = line->edge[start];
+ d1 = line->edge[start + len];
+ d2 = d3 = d4 = 0; // make gcc happy
+
+ switch (line->rot) {
+ case 0:
+ d2 = line->yMin;
+ d3 = line->yMax;
+ d4 = line->base;
+ d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
+ d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
+ d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
+ d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
+ d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
+ break;
+ case 1:
+ d2 = line->xMax;
+ d3 = line->xMin;
+ d4 = line->base;
+ d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
+ d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
+ d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
+ d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
+ d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
+ break;
+ case 2:
+ d2 = line->yMax;
+ d3 = line->yMin;
+ d4 = line->base;
+ d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
+ d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
+ d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
+ d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
+ d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
+ break;
+ case 3:
+ d2 = line->xMin;
+ d3 = line->xMax;
+ d4 = line->base;
+ d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
+ d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
+ d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
+ d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
+ d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
+ break;
+ }
+
+ switch (line->blk->page->primaryRot) {
+ case 0:
+ xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
+ xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
+ yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
+ yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
+ base = blk->yMin + base * (blk->yMax - blk->yMin);
+ break;
+ case 1:
+ xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
+ xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
+ yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
+ yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
+ base = blk->xMax - d4 * (blk->xMax - blk->xMin);
+ break;
+ case 2:
+ xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
+ xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
+ yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
+ yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
+ base = blk->yMax - d4 * (blk->yMax - blk->yMin);
+ break;
+ case 3:
+ xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
+ xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
+ yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
+ yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
+ base = blk->xMin + d4 * (blk->xMax - blk->xMin);
+ break;
+ }
- // overlapping case
- return xMin < line2->xMin;
+ }
+ }
}
-// Merge another line's words onto the end of this line.
-void TextLine::merge(TextLine *line2) {
- int newLen, i;
+int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
+ double cmp;
- xMax = line2->xMax;
- if (line2->yMin < yMin) {
- yMin = line2->yMin;
+ cmp = 0; // make gcc happy
+ switch (frag1->line->blk->page->primaryRot) {
+ case 0:
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
+ cmp = frag1->xMin - frag2->xMin;
+ }
+ break;
+ case 1:
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
+ cmp = frag1->yMin - frag2->yMin;
+ }
+ break;
+ case 2:
+ if ((cmp = frag2->yMin - frag1->yMin) == 0) {
+ cmp = frag2->xMax - frag1->xMax;
+ }
+ break;
+ case 3:
+ if ((cmp = frag1->xMax - frag2->xMax) == 0) {
+ cmp = frag2->yMax - frag1->yMax;
+ }
+ break;
}
- if (line2->yMax > yMax) {
- yMax = line2->yMax;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (frag1->line->rot) {
+ case 0:
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
+ cmp = frag1->xMin - frag2->xMin;
+ }
+ break;
+ case 1:
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
+ cmp = frag1->yMin - frag2->yMin;
+ }
+ break;
+ case 2:
+ if ((cmp = frag2->yMin - frag1->yMin) == 0) {
+ cmp = frag2->xMax - frag1->xMax;
+ }
+ break;
+ case 3:
+ if ((cmp = frag1->xMax - frag2->xMax) == 0) {
+ cmp = frag2->yMax - frag1->yMax;
+ }
+ break;
}
- xSpaceR = line2->xSpaceR;
- lastWord->spaceAfter = gTrue;
- lastWord->next = line2->words;
- lastWord = line2->lastWord;
- line2->words = NULL;
- newLen = len + 1 + line2->len;
- text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
- xRight = (double *)grealloc(xRight, newLen * sizeof(double));
- text[len] = (Unicode)0x0020;
- xRight[len] = line2->xMin;
- for (i = 0; i < line2->len; ++i) {
- text[len + 1 + i] = line2->text[i];
- xRight[len + 1 + i] = line2->xRight[i];
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
+ TextLineFrag *frag1 = (TextLineFrag *)p1;
+ TextLineFrag *frag2 = (TextLineFrag *)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (frag1->line->rot) {
+ case 0:
+ if ((cmp = frag1->xMin - frag2->xMin) == 0) {
+ cmp = frag1->yMin - frag2->yMin;
+ }
+ break;
+ case 1:
+ if ((cmp = frag1->yMin - frag2->yMin) == 0) {
+ cmp = frag2->xMax - frag1->xMax;
+ }
+ break;
+ case 2:
+ if ((cmp = frag2->xMax - frag1->xMax) == 0) {
+ cmp = frag2->yMin - frag1->yMin;
+ }
+ break;
+ case 3:
+ if ((cmp = frag2->yMax - frag1->yMax) == 0) {
+ cmp = frag1->xMax - frag2->xMax;
+ }
+ break;
}
- len = newLen;
- convertedLen += line2->convertedLen;
- hyphenated = line2->hyphenated;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
}
//------------------------------------------------------------------------
// TextBlock
//------------------------------------------------------------------------
-TextBlock::TextBlock() {
+TextBlock::TextBlock(TextPage *pageA, int rotA) {
+ page = pageA;
+ rot = rotA;
+ xMin = yMin = 0;
+ xMax = yMax = -1;
+ priMin = 0;
+ priMax = page->pageWidth;
+ pool = new TextPool();
lines = NULL;
+ curLine = NULL;
next = NULL;
+ stackNext = NULL;
}
TextBlock::~TextBlock() {
- TextLine *l1, *l2;
+ TextLine *line;
- for (l1 = lines; l1; l1 = l2) {
- l2 = l1->next;
- delete l1;
+ delete pool;
+ while (lines) {
+ line = lines;
+ lines = lines->next;
+ delete line;
}
}
-// Returns true if <this> comes before <blk2> in xy order, allowing
-// slack for vertically overlapping blocks.
-GBool TextBlock::yxBefore(TextBlock *blk2) {
- double dy;
+void TextBlock::addWord(TextWord *word) {
+ pool->addWord(word);
+ if (xMin > xMax) {
+ xMin = word->xMin;
+ xMax = word->xMax;
+ yMin = word->yMin;
+ yMax = word->yMax;
+ } else {
+ if (word->xMin < xMin) {
+ xMin = word->xMin;
+ }
+ if (word->xMax > xMax) {
+ xMax = word->xMax;
+ }
+ if (word->yMin < yMin) {
+ yMin = word->yMin;
+ }
+ if (word->yMax > yMax) {
+ yMax = word->yMax;
+ }
+ }
+}
- dy = blkOverlapSlack * lines->fontSize;
+void TextBlock::coalesce(UnicodeMap *uMap) {
+ TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
+ TextLine *line, *line0, *line1;
+ int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
+ int baseIdx, bestWordBaseIdx, idx0, idx1;
+ double minBase, maxBase;
+ double fontSize, delta, priDelta, secDelta;
+ TextLine **lineArray;
+ GBool found;
+ int col1, col2;
+ int i, j, k;
+
+ // discard duplicated text (fake boldface, drop shadows)
+ for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
+ word0 = pool->getPool(idx0);
+ while (word0) {
+ priDelta = dupMaxPriDelta * word0->fontSize;
+ secDelta = dupMaxSecDelta * word0->fontSize;
+ if (rot == 0 || rot == 3) {
+ maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
+ } else {
+ maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
+ }
+ found = gFalse;
+ word1 = word2 = NULL; // make gcc happy
+ for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
+ if (idx1 == idx0) {
+ word1 = word0;
+ word2 = word0->next;
+ } else {
+ word1 = NULL;
+ word2 = pool->getPool(idx1);
+ }
+ for (; word2; word1 = word2, word2 = word2->next) {
+ if (word2->len == word0->len &&
+ !memcmp(word2->text, word0->text,
+ word0->len * sizeof(Unicode))) {
+ switch (rot) {
+ case 0:
+ case 2:
+ found = fabs(word0->xMin - word2->xMin) < priDelta &&
+ fabs(word0->xMax - word2->xMax) < priDelta &&
+ fabs(word0->yMin - word2->yMin) < secDelta &&
+ fabs(word0->yMax - word2->yMax) < secDelta;
+ break;
+ case 1:
+ case 3:
+ found = fabs(word0->xMin - word2->xMin) < secDelta &&
+ fabs(word0->xMax - word2->xMax) < secDelta &&
+ fabs(word0->yMin - word2->yMin) < priDelta &&
+ fabs(word0->yMax - word2->yMax) < priDelta;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (found) {
+ if (word1) {
+ word1->next = word2->next;
+ } else {
+ pool->setPool(idx1, word2->next);
+ }
+ delete word2;
+ } else {
+ word0 = word0->next;
+ }
+ }
+ }
+
+ // build the lines
+ curLine = NULL;
+ poolMinBaseIdx = pool->minBaseIdx;
+ charCount = 0;
+ nLines = 0;
+ while (1) {
- // non-overlapping case
- if (blk2->yMin > yMax - dy ||
- blk2->yMax < yMin + dy) {
- return yMin < blk2->yMin ||
- (yMin == blk2->yMin && xMin < blk2->xMin);
+ // find the first non-empty line in the pool
+ for (;
+ poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
+ ++poolMinBaseIdx) ;
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
+ break;
+ }
+
+ // look for the left-most word in the first four lines of the
+ // pool -- this avoids starting with a superscript word
+ startBaseIdx = poolMinBaseIdx;
+ for (baseIdx = poolMinBaseIdx + 1;
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
+ ++baseIdx) {
+ if (!pool->getPool(baseIdx)) {
+ continue;
+ }
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
+ < 0) {
+ startBaseIdx = baseIdx;
+ }
+ }
+
+ // create a new line
+ word0 = pool->getPool(startBaseIdx);
+ pool->setPool(startBaseIdx, word0->next);
+ word0->next = NULL;
+ line = new TextLine(this, word0->rot, word0->base);
+ line->addWord(word0);
+ lastWord = word0;
+
+ // compute the search range
+ fontSize = word0->fontSize;
+ minBase = word0->base - maxIntraLineDelta * fontSize;
+ maxBase = word0->base + maxIntraLineDelta * fontSize;
+ minBaseIdx = pool->getBaseIdx(minBase);
+ maxBaseIdx = pool->getBaseIdx(maxBase);
+
+ // find the rest of the words in this line
+ while (1) {
+
+ // find the left-most word whose baseline is in the range for
+ // this line
+ bestWordBaseIdx = 0;
+ bestWord0 = bestWord1 = NULL;
+ for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
+ for (word0 = NULL, word1 = pool->getPool(baseIdx);
+ word1;
+ word0 = word1, word1 = word1->next) {
+ if (word1->base >= minBase &&
+ word1->base <= maxBase &&
+ (delta = lastWord->primaryDelta(word1)) >=
+ minCharSpacing * fontSize) {
+ if (delta < maxWordSpacing * fontSize &&
+ (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
+ bestWordBaseIdx = baseIdx;
+ bestWord0 = word0;
+ bestWord1 = word1;
+ }
+ break;
+ }
+ }
+ }
+ if (!bestWord1) {
+ break;
+ }
+
+ // remove it from the pool, and add it to the line
+ if (bestWord0) {
+ bestWord0->next = bestWord1->next;
+ } else {
+ pool->setPool(bestWordBaseIdx, bestWord1->next);
+ }
+ bestWord1->next = NULL;
+ line->addWord(bestWord1);
+ lastWord = bestWord1;
+ }
+
+ // add the line
+ if (curLine && line->cmpYX(curLine) > 0) {
+ line0 = curLine;
+ line1 = curLine->next;
+ } else {
+ line0 = NULL;
+ line1 = lines;
+ }
+ for (;
+ line1 && line->cmpYX(line1) > 0;
+ line0 = line1, line1 = line1->next) ;
+ if (line0) {
+ line0->next = line;
+ } else {
+ lines = line;
+ }
+ line->next = line1;
+ curLine = line;
+ line->coalesce(uMap);
+ charCount += line->len;
+ ++nLines;
}
- // overlapping case
- return xMin < blk2->xMin;
+ // sort lines into xy order for column assignment
+ lineArray = (TextLine **)gmalloc(nLines * sizeof(TextLine *));
+ for (line = lines, i = 0; line; line = line->next, ++i) {
+ lineArray[i] = line;
+ }
+ qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
+
+ // column assignment
+ nColumns = 0;
+ for (i = 0; i < nLines; ++i) {
+ line0 = lineArray[i];
+ col1 = 0;
+ for (j = 0; j < i; ++j) {
+ line1 = lineArray[j];
+ if (line1->primaryDelta(line0) >= 0) {
+ col2 = line1->col[line1->len] + 1;
+ } else {
+ k = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ for (k = 0;
+ k < line1->len &&
+ line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 1:
+ for (k = 0;
+ k < line1->len &&
+ line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 2:
+ for (k = 0;
+ k < line1->len &&
+ line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ case 3:
+ for (k = 0;
+ k < line1->len &&
+ line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
+ ++k) ;
+ break;
+ }
+ col2 = line1->col[k];
+ }
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ for (k = 0; k <= line0->len; ++k) {
+ line0->col[k] += col1;
+ }
+ if (line0->col[line0->len] > nColumns) {
+ nColumns = line0->col[line0->len];
+ }
+ }
+ gfree(lineArray);
}
-// Merge another block's line onto the right of this one.
-void TextBlock::mergeRight(TextBlock *blk2) {
- lines->merge(blk2->lines);
- xMax = lines->xMax;
- yMin = lines->yMin;
- yMax = lines->yMax;
- xSpaceR = lines->xSpaceR;
+void TextBlock::updatePriMinMax(TextBlock *blk) {
+ double newPriMin, newPriMax;
+ GBool gotPriMin, gotPriMax;
+
+ gotPriMin = gotPriMax = gFalse;
+ newPriMin = newPriMax = 0; // make gcc happy
+ switch (page->primaryRot) {
+ case 0:
+ case 2:
+ if (blk->yMin < yMax && blk->yMax > yMin) {
+ if (blk->xMin < xMin) {
+ newPriMin = blk->xMax;
+ gotPriMin = gTrue;
+ }
+ if (blk->xMax > xMax) {
+ newPriMax = blk->xMin;
+ gotPriMax = gTrue;
+ }
+ }
+ break;
+ case 1:
+ case 3:
+ if (blk->xMin < xMax && blk->xMax > xMin) {
+ if (blk->yMin < yMin) {
+ newPriMin = blk->yMax;
+ gotPriMin = gTrue;
+ }
+ if (blk->yMax > yMax) {
+ newPriMax = blk->yMin;
+ gotPriMax = gTrue;
+ }
+ }
+ break;
+ }
+ if (gotPriMin) {
+ if (newPriMin > xMin) {
+ newPriMin = xMin;
+ }
+ if (newPriMin > priMin) {
+ priMin = newPriMin;
+ }
+ }
+ if (gotPriMax) {
+ if (newPriMax < xMax) {
+ newPriMax = xMax;
+ }
+ if (newPriMax < priMax) {
+ priMax = newPriMax;
+ }
+ }
}
-// Merge another block's lines onto the bottom of this block.
-void TextBlock::mergeBelow(TextBlock *blk2) {
- TextLine *line;
+int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
+ TextBlock *blk1 = *(TextBlock **)p1;
+ TextBlock *blk2 = *(TextBlock **)p2;
+ double cmp;
- if (blk2->xMin < xMin) {
- xMin = blk2->xMin;
+ cmp = 0; // make gcc happy
+ switch (blk1->page->primaryRot) {
+ case 0:
+ if ((cmp = blk1->xMin - blk2->xMin) == 0) {
+ cmp = blk1->yMin - blk2->yMin;
+ }
+ break;
+ case 1:
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
+ cmp = blk2->xMax - blk1->xMax;
+ }
+ break;
+ case 2:
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
+ cmp = blk2->yMin - blk1->yMin;
+ }
+ break;
+ case 3:
+ if ((cmp = blk2->yMax - blk1->yMax) == 0) {
+ cmp = blk1->xMax - blk2->xMax;
+ }
+ break;
}
- if (blk2->xMax > xMax) {
- xMax = blk2->xMax;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
+ TextBlock *blk1 = *(TextBlock **)p1;
+ TextBlock *blk2 = *(TextBlock **)p2;
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (blk1->page->primaryRot) {
+ case 0:
+ if ((cmp = blk1->yMin - blk2->yMin) == 0) {
+ cmp = blk1->xMin - blk2->xMin;
+ }
+ break;
+ case 1:
+ if ((cmp = blk2->xMax - blk1->xMax) == 0) {
+ cmp = blk1->yMin - blk2->yMin;
+ }
+ break;
+ case 2:
+ if ((cmp = blk2->yMin - blk1->yMin) == 0) {
+ cmp = blk2->xMax - blk1->xMax;
+ }
+ break;
+ case 3:
+ if ((cmp = blk1->xMax - blk2->xMax) == 0) {
+ cmp = blk2->yMax - blk1->yMax;
+ }
+ break;
}
- yMax = blk2->yMax;
- if (blk2->xSpaceL > xSpaceL) {
- xSpaceL = blk2->xSpaceL;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+int TextBlock::primaryCmp(TextBlock *blk) {
+ double cmp;
+
+ cmp = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ cmp = xMin - blk->xMin;
+ break;
+ case 1:
+ cmp = yMin - blk->yMin;
+ break;
+ case 2:
+ cmp = blk->xMax - xMax;
+ break;
+ case 3:
+ cmp = blk->yMax - yMax;
+ break;
}
- if (blk2->xSpaceR < xSpaceR) {
- xSpaceR = blk2->xSpaceR;
+ return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
+}
+
+double TextBlock::secondaryDelta(TextBlock *blk) {
+ double delta;
+
+ delta = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ delta = blk->yMin - yMax;
+ break;
+ case 1:
+ delta = xMin - blk->xMax;
+ break;
+ case 2:
+ delta = yMin - blk->yMax;
+ break;
+ case 3:
+ delta = blk->xMin - xMax;
+ break;
}
- if (blk2->maxFontSize > maxFontSize) {
- maxFontSize = blk2->maxFontSize;
+ return delta;
+}
+
+GBool TextBlock::isBelow(TextBlock *blk) {
+ GBool below;
+
+ below = gFalse; // make gcc happy
+ switch (page->primaryRot) {
+ case 0:
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
+ yMin > blk->yMin;
+ break;
+ case 1:
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
+ xMax < blk->xMax;
+ break;
+ case 2:
+ below = xMin >= blk->priMin && xMax <= blk->priMax &&
+ yMax < blk->yMax;
+ break;
+ case 3:
+ below = yMin >= blk->priMin && yMax <= blk->priMax &&
+ xMin > blk->xMin;
+ break;
}
- for (line = lines; line->next; line = line->next) ;
- line->next = line->flowNext = blk2->lines;
- blk2->lines = NULL;
+
+ return below;
}
//------------------------------------------------------------------------
// TextFlow
//------------------------------------------------------------------------
-TextFlow::TextFlow() {
- blocks = NULL;
+TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
+ page = pageA;
+ xMin = blk->xMin;
+ xMax = blk->xMax;
+ yMin = blk->yMin;
+ yMax = blk->yMax;
+ priMin = blk->priMin;
+ priMax = blk->priMax;
+ blocks = lastBlk = blk;
next = NULL;
}
TextFlow::~TextFlow() {
- TextBlock *b1, *b2;
+ TextBlock *blk;
- for (b1 = blocks; b1; b1 = b2) {
- b2 = b1->next;
- delete b1;
+ while (blocks) {
+ blk = blocks;
+ blocks = blocks->next;
+ delete blk;
}
}
+void TextFlow::addBlock(TextBlock *blk) {
+ if (lastBlk) {
+ lastBlk->next = blk;
+ } else {
+ blocks = blk;
+ }
+ lastBlk = blk;
+ if (blk->xMin < xMin) {
+ xMin = blk->xMin;
+ }
+ if (blk->xMax > xMax) {
+ xMax = blk->xMax;
+ }
+ if (blk->yMin < yMin) {
+ yMin = blk->yMin;
+ }
+ if (blk->yMax > yMax) {
+ yMax = blk->yMax;
+ }
+}
+
+GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
+ GBool fits;
+
+ // lower blocks must use smaller fonts
+ if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
+ return gFalse;
+ }
+
+ fits = gFalse; // make gcc happy
+ switch (page->primaryRot) {
+ case 0:
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
+ break;
+ case 1:
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
+ break;
+ case 2:
+ fits = blk->xMin >= priMin && blk->xMax <= priMax;
+ break;
+ case 3:
+ fits = blk->yMin >= priMin && blk->yMax <= priMax;
+ break;
+ }
+ return fits;
+}
+
+#if TEXTOUT_WORD_LIST
+
+//------------------------------------------------------------------------
+// TextWordList
+//------------------------------------------------------------------------
+
+TextWordList::TextWordList(TextPage *text, GBool physLayout) {
+ TextFlow *flow;
+ TextBlock *blk;
+ TextLine *line;
+ TextWord *word;
+ TextWord **wordArray;
+ int nWords, i;
+
+ words = new GList();
+
+ if (text->rawOrder) {
+ for (word = text->rawWords; word; word = word->next) {
+ words->append(word);
+ }
+
+ } else if (physLayout) {
+ // this is inefficient, but it's also the least useful of these
+ // three cases
+ nWords = 0;
+ for (flow = text->flows; flow; flow = flow->next) {
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ ++nWords;
+ }
+ }
+ }
+ }
+ wordArray = (TextWord **)gmalloc(nWords * sizeof(TextWord *));
+ i = 0;
+ for (flow = text->flows; flow; flow = flow->next) {
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ wordArray[i++] = word;
+ }
+ }
+ }
+ }
+ qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
+ for (i = 0; i < nWords; ++i) {
+ words->append(wordArray[i]);
+ }
+ gfree(wordArray);
+
+ } else {
+ for (flow = text->flows; flow; flow = flow->next) {
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ words->append(word);
+ }
+ }
+ }
+ }
+ }
+}
+
+TextWordList::~TextWordList() {
+ delete words;
+}
+
+int TextWordList::getLength() {
+ return words->getLength();
+}
+
+TextWord *TextWordList::get(int idx) {
+ if (idx < 0 || idx >= words->getLength()) {
+ return NULL;
+ }
+ return (TextWord *)words->get(idx);
+}
+
+#endif // TEXTOUT_WORD_LIST
//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
TextPage::TextPage(GBool rawOrderA) {
+ int rot;
+
rawOrder = rawOrderA;
curWord = NULL;
charPos = 0;
- font = NULL;
- fontSize = 0;
+ curFont = NULL;
+ curFontSize = 0;
nest = 0;
nTinyChars = 0;
- words = wordPtr = NULL;
- lines = NULL;
+ if (!rawOrder) {
+ for (rot = 0; rot < 4; ++rot) {
+ pools[rot] = new TextPool();
+ }
+ }
flows = NULL;
+ blocks = NULL;
+ rawWords = NULL;
+ rawLastWord = NULL;
fonts = new GList();
+ lastFindXMin = lastFindYMin = 0;
+ haveLastFind = gFalse;
}
TextPage::~TextPage() {
+ int rot;
+
clear();
+ if (!rawOrder) {
+ for (rot = 0; rot < 4; ++rot) {
+ delete pools[rot];
+ }
+ }
delete fonts;
}
+void TextPage::startPage(GfxState *state) {
+ clear();
+ if (state) {
+ pageWidth = state->getPageWidth();
+ pageHeight = state->getPageHeight();
+ } else {
+ pageWidth = pageHeight = 0;
+ }
+}
+
+void TextPage::clear() {
+ int rot;
+ TextFlow *flow;
+ TextWord *word;
+
+ if (curWord) {
+ delete curWord;
+ curWord = NULL;
+ }
+ if (rawOrder) {
+ while (rawWords) {
+ word = rawWords;
+ rawWords = rawWords->next;
+ delete word;
+ }
+ } else {
+ for (rot = 0; rot < 4; ++rot) {
+ delete pools[rot];
+ }
+ while (flows) {
+ flow = flows;
+ flows = flows->next;
+ delete flow;
+ }
+ gfree(blocks);
+ }
+ deleteGList(fonts, TextFontInfo);
+
+ curWord = NULL;
+ charPos = 0;
+ curFont = NULL;
+ curFontSize = 0;
+ nest = 0;
+ nTinyChars = 0;
+ if (!rawOrder) {
+ for (rot = 0; rot < 4; ++rot) {
+ pools[rot] = new TextPool();
+ }
+ }
+ flows = NULL;
+ blocks = NULL;
+ rawWords = NULL;
+ rawLastWord = NULL;
+ fonts = new GList();
+}
+
void TextPage::updateFont(GfxState *state) {
GfxFont *gfxFont;
double *fm;
@@ -454,22 +1682,22 @@ void TextPage::updateFont(GfxState *state) {
int i;
// get the font info object
- font = NULL;
+ curFont = NULL;
for (i = 0; i < fonts->getLength(); ++i) {
- font = (TextFontInfo *)fonts->get(i);
- if (font->matches(state)) {
+ curFont = (TextFontInfo *)fonts->get(i);
+ if (curFont->matches(state)) {
break;
}
- font = NULL;
+ curFont = NULL;
}
- if (!font) {
- font = new TextFontInfo(state);
- fonts->append(font);
+ if (!curFont) {
+ curFont = new TextFontInfo(state);
+ fonts->append(curFont);
}
// adjust the font size
gfxFont = state->getFont();
- fontSize = state->getTransformedFontSize();
+ curFontSize = state->getTransformedFontSize();
if (gfxFont && gfxFont->getType() == fontType3) {
// This is a hack which makes it possible to deal with some Type 3
// fonts. The problem is that it's impossible to know what the
@@ -496,24 +1724,28 @@ void TextPage::updateFont(GfxState *state) {
if (mCode >= 0 &&
(w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
// 0.6 is a generic average 'm' width -- yes, this is a hack
- fontSize *= w / 0.6;
+ curFontSize *= w / 0.6;
} else if (letterCode >= 0 &&
(w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
// even more of a hack: 0.5 is a generic letter width
- fontSize *= w / 0.5;
+ curFontSize *= w / 0.5;
} else if (anyCode >= 0 &&
(w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
// better than nothing: 0.5 is a generic character width
- fontSize *= w / 0.5;
+ curFontSize *= w / 0.5;
}
fm = gfxFont->getFontMatrix();
if (fm[0] != 0) {
- fontSize *= fabs(fm[3] / fm[0]);
+ curFontSize *= fabs(fm[3] / fm[0]);
}
}
}
void TextPage::beginWord(GfxState *state, double x0, double y0) {
+ double *txtm, *ctm, *fontm;
+ double m[4], m2[4];
+ int rot;
+
// This check is needed because Type 3 characters can contain
// text-drawing operations (when TextPage is being used via
// XOutputDev rather than TextOutputDev).
@@ -522,7 +1754,31 @@ void TextPage::beginWord(GfxState *state, double x0, double y0) {
return;
}
- curWord = new TextWord(state, x0, y0, charPos, font, fontSize);
+ // compute the rotation
+ txtm = state->getTextMat();
+ ctm = state->getCTM();
+ m[0] = txtm[0] * ctm[0] + txtm[1] * ctm[2];
+ m[1] = txtm[0] * ctm[1] + txtm[1] * ctm[3];
+ m[2] = txtm[2] * ctm[0] + txtm[3] * ctm[2];
+ m[3] = txtm[2] * ctm[1] + txtm[3] * ctm[3];
+ if (state->getFont()->getType() == fontType3) {
+ fontm = state->getFont()->getFontMatrix();
+ m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
+ m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
+ m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
+ m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
+ m[0] = m2[0];
+ m[1] = m2[1];
+ m[2] = m2[2];
+ m[3] = m2[3];
+ }
+ if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
+ rot = (m[3] < 0) ? 0 : 2;
+ } else {
+ rot = (m[2] > 0) ? 1 : 3;
+ }
+
+ curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
}
void TextPage::addChar(GfxState *state, double x, double y,
@@ -557,7 +1813,7 @@ void TextPage::addChar(GfxState *state, double x, double y,
// check the tiny chars limit
if (!globalParams->getTextKeepTinyChars() &&
fabs(w1) < 3 && fabs(h1) < 3) {
- if (++nTinyChars > 20000) {
+ if (++nTinyChars > 50000) {
return;
}
}
@@ -574,16 +1830,26 @@ void TextPage::addChar(GfxState *state, double x, double y,
// this case, break text into individual chars and let the coalesce
// function deal with it later
n = curWord->len;
- if (n > 0 && x1 - curWord->xRight[n-1] >
- curWord->font->minSpaceWidth * curWord->fontSize) {
- endWord();
- beginWord(state, x, y);
+ if (n > 0) {
+ switch (curWord->rot) {
+ case 0: sp = x1 - curWord->xMax; break;
+ case 1: sp = y1 - curWord->yMax; break;
+ case 2: sp = curWord->xMin - x1; break;
+ case 3: sp = curWord->yMin - y1; break;
+ }
+ if (sp > defaultSpaceWidth * curWord->fontSize) {
+ endWord();
+ beginWord(state, x, y);
+ }
}
// page rotation and/or transform matrices can cause text to be
// drawn in reverse order -- in this case, swap the begin/end
// coordinates and break text into individual chars
- if (w1 < 0) {
+ if ((curWord->rot == 0 && w1 < 0) ||
+ (curWord->rot == 1 && h1 < 0) ||
+ (curWord->rot == 2 && w1 > 0) ||
+ (curWord->rot == 3 && h1 > 0)) {
endWord();
beginWord(state, x + dx, y + dy);
x1 += w1;
@@ -620,8 +1886,6 @@ void TextPage::endWord() {
}
void TextPage::addWord(TextWord *word) {
- TextWord *p1, *p2;
-
// throw away zero-length words -- they don't have valid xMin/xMax
// values, and they're useless anyway
if (word->len == 0) {
@@ -629,533 +1893,399 @@ void TextPage::addWord(TextWord *word) {
return;
}
- // insert word in xy list
if (rawOrder) {
- p1 = wordPtr;
- p2 = NULL;
- } else {
- if (wordPtr && wordPtr->xyBefore(word)) {
- p1 = wordPtr;
- p2 = wordPtr->next;
+ if (rawLastWord) {
+ rawLastWord->next = word;
} else {
- p1 = NULL;
- p2 = words;
- }
- for (; p2; p1 = p2, p2 = p2->next) {
- if (word->xyBefore(p2)) {
- break;
- }
+ rawWords = word;
}
- }
- if (p1) {
- p1->next = word;
+ rawLastWord = word;
} else {
- words = word;
+ pools[word->rot]->addWord(word);
}
- word->next = p2;
- wordPtr = word;
}
void TextPage::coalesce(GBool physLayout) {
+ UnicodeMap *uMap;
+ TextPool *pool;
TextWord *word0, *word1, *word2;
- TextLine *line0, *line1, *line2, *line3, *line4, *lineList;
- TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6;
- TextBlock *yxBlocks, *blocks, *blkStack;
- TextFlow *flow0, *flow1;
- double sz, xLimit, yLimit;
- double fit1, fit2, sp1, sp2 = 0.0e+0;
+ TextLine *line;
+ TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
+ TextBlock **blkArray;
+ TextFlow *flow, *lastFlow;
+ int rot, poolMinBaseIdx, baseIdx, startBaseIdx;
+ double minBase, maxBase, newMinBase, newMaxBase;
+ double fontSize, colSpace, lineSpace, intraLineSpace, blkSpace;
GBool found;
- UnicodeMap *uMap;
- GBool isUnicode;
- char buf[8];
- int col1, col2, d, i, j;
+ int count[4];
+ int lrCount;
+ int firstBlkIdx, nBlocksLeft;
+ int col1, col2;
+ int i, j, n;
-#if 0 // for debugging
- printf("*** initial word list ***\n");
- for (word0 = words; word0; word0 = word0->next) {
- printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
- }
- printf("'\n");
+ if (rawOrder) {
+ primaryRot = 0;
+ primaryLR = gTrue;
+ return;
}
- printf("\n");
- fflush(stdout);
-#endif
- //----- discard duplicated text (fake boldface, drop shadows)
-
- word0 = words;
- while (word0) {
- sz = word0->fontSize;
- xLimit = word0->xMin + sz * dupMaxDeltaX;
- found = gFalse;
- for (word1 = word0, word2 = word0->next;
- word2 && word2->xMin < xLimit;
- word1 = word2, word2 = word2->next) {
- if (word2->len == word0->len &&
- !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) &&
- fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY &&
- fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY &&
- fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) {
- found = gTrue;
- break;
- }
- }
- if (found) {
- word1->next = word2->next;
- delete word2;
- } else {
- word0 = word0->next;
- }
- }
+ uMap = globalParams->getTextEncoding();
+ blkList = NULL;
+ lastBlk = NULL;
+ nBlocks = 0;
+ primaryRot = -1;
#if 0 // for debugging
- printf("*** words after removing duplicate text ***\n");
- for (word0 = words; word0; word0 = word0->next) {
- printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
- }
- printf("'\n");
- }
- printf("\n");
- fflush(stdout);
-#endif
-
- //----- merge words
-
- word0 = words;
- while (word0) {
- sz = word0->fontSize;
-
- // look for adjacent text which is part of the same word, and
- // merge it into this word
- xLimit = word0->xMax + sz * word0->font->minSpaceWidth;
- if (rawOrder) {
- word1 = word0;
- word2 = word0->next;
- found = word2 &&
- word2->xMin < xLimit &&
- word2->font == word0->font &&
- fabs(word2->fontSize - sz) < 0.05 &&
- fabs(word2->yBase - word0->yBase) < 0.05 &&
- word2->charPos == word0->charPos + word0->charLen;
- } else {
- found = gFalse;
- for (word1 = word0, word2 = word0->next;
- word2 && word2->xMin < xLimit;
- word1 = word2, word2 = word2->next) {
- if (word2->font == word0->font &&
- fabs(word2->fontSize - sz) < 0.05 &&
- fabs(word2->yBase - word0->yBase) < 0.05 &&
- word2->charPos == word0->charPos + word0->charLen) {
- found = gTrue;
- break;
+ printf("*** initial words ***\n");
+ for (rot = 0; rot < 4; ++rot) {
+ pool = pools[rot];
+ for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
+ for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->base, word0->fontSize);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
}
+ printf("'\n");
}
}
- if (found) {
- word0->merge(word2);
- word1->next = word2->next;
- delete word2;
- continue;
- }
-
- word0 = word0->next;
- }
-
-#if 0 // for debugging
- printf("*** after merging words ***\n");
- for (word0 = words; word0; word0 = word0->next) {
- printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
- }
- printf("'\n");
}
printf("\n");
- fflush(stdout);
#endif
- //----- assemble words into lines
-
- lineList = line0 = NULL;
- while (words) {
-
- // remove the first word from the word list
- word0 = words;
- words = words->next;
- word0->next = NULL;
-
- // find the best line (if any) for the word
- if (rawOrder) {
- if (line0 && lineFit(line0, word0, &sp2) >= 0) {
- line1 = line0;
- sp1 = sp2;
- } else {
- line1 = NULL;
- sp1 = 0;
- }
- } else {
- line1 = NULL;
- fit1 = 0;
- sp1 = 0;
- for (line2 = lineList; line2; line2 = line2->next) {
- fit2 = lineFit(line2, word0, &sp2);
- if (fit2 >= 0 && (!line1 || fit2 < fit1)) {
- line1 = line2;
- fit1 = fit2;
- sp1 = sp2;
- }
- }
- }
-
- // found a line: append the word
- if (line1) {
- word1 = line1->lastWord;
- word1->next = word0;
- line1->lastWord = word0;
- if (word0->xMax > line1->xMax) {
- line1->xMax = word0->xMax;
- }
- if (word0->yMin < line1->yMin) {
- line1->yMin = word0->yMin;
- }
- if (word0->yMax > line1->yMax) {
- line1->yMax = word0->yMax;
- }
- line1->len += word0->len;
- if (sp1 > line1->fontSize * line1->font->minSpaceWidth) {
- word1->spaceAfter = gTrue;
- ++line1->len;
- }
-
- // didn't find a line: create a new line
- } else {
- line1 = new TextLine();
- line1->words = line1->lastWord = word0;
- line1->xMin = word0->xMin;
- line1->xMax = word0->xMax;
- line1->yMin = word0->yMin;
- line1->yMax = word0->yMax;
- line1->yBase = word0->yBase;
- line1->font = word0->font;
- line1->fontSize = word0->fontSize;
- line1->len = word0->len;
- if (line0) {
- line0->next = line1;
- } else {
- lineList = line1;
- }
- line0 = line1;
- }
- }
-
- // build the line text
- uMap = globalParams->getTextEncoding();
- isUnicode = uMap ? uMap->isUnicode() : gFalse;
-
- for (line1 = lineList; line1; line1 = line1->next) {
- line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode));
- line1->xRight = (double *)gmalloc(line1->len * sizeof(double));
- line1->col = (int *)gmalloc(line1->len * sizeof(int));
- i = 0;
- for (word1 = line1->words; word1; word1 = word1->next) {
- for (j = 0; j < word1->len; ++j) {
- line1->text[i] = word1->text[j];
- line1->xRight[i] = word1->xRight[j];
- ++i;
- }
- if (word1->spaceAfter && word1->next) {
- line1->text[i] = (Unicode)0x0020;
- line1->xRight[i] = word1->next->xMin;
- ++i;
- }
- }
- line1->convertedLen = 0;
- for (j = 0; j < line1->len; ++j) {
- line1->col[j] = line1->convertedLen;
- if (isUnicode) {
- ++line1->convertedLen;
- } else if (uMap) {
- line1->convertedLen +=
- uMap->mapUnicode(line1->text[j], buf, sizeof(buf));
- }
- }
+ //----- assemble the blocks
- // check for hyphen at end of line
- //~ need to check for other chars used as hyphens
- if (line1->text[line1->len - 1] == (Unicode)'-') {
- line1->hyphenated = gTrue;
- }
+ //~ add an outer loop for writing mode (vertical text)
- }
+ // build blocks for each rotation value
+ for (rot = 0; rot < 4; ++rot) {
+ pool = pools[rot];
+ poolMinBaseIdx = pool->minBaseIdx;
+ count[rot] = 0;
- if (uMap) {
- uMap->decRefCnt();
- }
+ // add blocks until no more words are left
+ while (1) {
-#if 0 // for debugging
- printf("*** lines in xy order ***\n");
- for (line0 = lineList; line0; line0 = line0->next) {
- printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->fontSize, word0->spaceAfter);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
+ // find the first non-empty line in the pool
+ for (;
+ poolMinBaseIdx <= pool->maxBaseIdx &&
+ !pool->getPool(poolMinBaseIdx);
+ ++poolMinBaseIdx) ;
+ if (poolMinBaseIdx > pool->maxBaseIdx) {
+ break;
}
- printf("'\n");
- }
- }
- printf("\n");
- fflush(stdout);
-#endif
-
- //----- column assignment
- for (line1 = lineList; line1; line1 = line1->next) {
- col1 = 0;
- for (line2 = lineList; line2 != line1; line2 = line2->next) {
- if (line1->xMin >= line2->xMax) {
- d = (int)((line1->xMin - line2->xMax) /
- (line1->font->maxSpaceWidth * line1->fontSize));
- if (d > 4) {
- d = 4;
+ // look for the left-most word in the first four lines of the
+ // pool -- this avoids starting with a superscript word
+ startBaseIdx = poolMinBaseIdx;
+ for (baseIdx = poolMinBaseIdx + 1;
+ baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
+ ++baseIdx) {
+ if (!pool->getPool(baseIdx)) {
+ continue;
}
- col2 = line2->col[0] + line2->convertedLen + d;
- if (col2 > col1) {
- col1 = col2;
- }
- } else if (line1->xMin > line2->xMin) {
- for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ;
- col2 = line2->col[i];
- if (col2 > col1) {
- col1 = col2;
+ if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
+ < 0) {
+ startBaseIdx = baseIdx;
}
}
- }
- for (j = 0; j < line1->len; ++j) {
- line1->col[j] += col1;
- }
- }
-#if 0 // for debugging
- printf("*** lines in xy order, after column assignment ***\n");
- for (line0 = lineList; line0; line0 = line0->next) {
- printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f col=%d len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->col[0], line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->fontSize, word0->spaceAfter);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
- }
- printf("'\n");
- }
- }
- printf("\n");
- fflush(stdout);
-#endif
-
- //----- assemble lines into blocks
-
- if (rawOrder) {
-
- lines = lineList;
- for (line1 = lines; line1; line1 = line1->next) {
- line1->xSpaceL = 0;
- line1->xSpaceR = pageWidth;
- }
-
- } else {
-
- // sort lines into yx order
- lines = NULL;
- while (lineList) {
- line0 = lineList;
- lineList = lineList->next;
- for (line1 = NULL, line2 = lines;
- line2 && !line0->yxBefore(line2);
- line1 = line2, line2 = line2->next) ;
- if (line1) {
- line1->next = line0;
- } else {
- lines = line0;
- }
- line0->next = line2;
- }
-
- // compute whitespace to left and right of each line
- line0 = lines;
- for (line1 = lines; line1; line1 = line1->next) {
-
- // find the first vertically overlapping line
- for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ;
-
- // check each vertically overlapping line -- look for the nearest
- // on each side
- line1->xSpaceL = 0;
- line1->xSpaceR = pageWidth;
- for (line2 = line0;
- line2 && line2->yMin < line1->yMax;
- line2 = line2->next) {
- if (line2->yMax > line1->yMin) {
- if (line2->xMax < line1->xMin) {
- if (line2->xMax > line1->xSpaceL) {
- line1->xSpaceL = line2->xMax;
- }
- } else if (line2->xMin > line1->xMax) {
- if (line2->xMin < line1->xSpaceR) {
- line1->xSpaceR = line2->xMin;
+ // create a new block
+ word0 = pool->getPool(startBaseIdx);
+ pool->setPool(startBaseIdx, word0->next);
+ word0->next = NULL;
+ blk = new TextBlock(this, rot);
+ blk->addWord(word0);
+
+ fontSize = word0->fontSize;
+ minBase = maxBase = word0->base;
+ colSpace = minColSpacing * fontSize;
+ lineSpace = maxLineSpacingDelta * fontSize;
+ intraLineSpace = maxIntraLineDelta * fontSize;
+
+ // add words to the block
+ do {
+ found = gFalse;
+
+ // look for words on the line above the current top edge of
+ // the block
+ newMinBase = minBase;
+ for (baseIdx = pool->getBaseIdx(minBase);
+ baseIdx >= pool->getBaseIdx(minBase - lineSpace);
+ --baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base < minBase &&
+ word1->base >= minBase - lineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta1 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ found = gTrue;
+ newMinBase = word2->base;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
}
}
}
- }
- }
- } // (!rawOrder)
-
-#if 0 // for debugging
- printf("*** lines in yx order ***\n");
- for (line0 = lines; line0; line0 = line0->next) {
- printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->fontSize, word0->spaceAfter);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
- }
- printf("'\n");
- }
- }
- printf("\n");
- fflush(stdout);
-#endif
-
- lineList = lines;
- yxBlocks = NULL;
- blk0 = NULL;
- while (lineList) {
-
- // build a new block object
- line0 = lineList;
- lineList = lineList->next;
- line0->next = NULL;
- blk1 = new TextBlock();
- blk1->lines = line0;
- blk1->xMin = line0->xMin;
- blk1->xMax = line0->xMax;
- blk1->yMin = line0->yMin;
- blk1->yMax = line0->yMax;
- blk1->xSpaceL = line0->xSpaceL;
- blk1->xSpaceR = line0->xSpaceR;
- blk1->maxFontSize = line0->fontSize;
-
- // find subsequent lines in the block
- while (lineList) {
-
- // look for the first horizontally overlapping line below this
- // one
- yLimit = line0->yMax + blkMaxSpacing * line0->fontSize;
- line3 = line4 = NULL;
- if (rawOrder) {
- if (lineList->yMin < yLimit &&
- lineList->xMax > blk1->xMin &&
- lineList->xMin < blk1->xMax) {
- line3 = NULL;
- line4 = lineList;
+ minBase = newMinBase;
+
+ // look for words on the line below the current bottom edge of
+ // the block
+ newMaxBase = maxBase;
+ for (baseIdx = pool->getBaseIdx(maxBase);
+ baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
+ ++baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base > maxBase &&
+ word1->base <= maxBase + lineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta1 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ found = gTrue;
+ newMaxBase = word2->base;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
}
- } else {
- for (line1 = NULL, line2 = lineList;
- line2 && line2->yMin < yLimit;
- line1 = line2, line2 = line2->next) {
- if (line2->xMax > blk1->xMin &&
- line2->xMin < blk1->xMax) {
- line3 = line1;
- line4 = line2;
- break;
+ maxBase = newMaxBase;
+
+ // look for words that are on lines already in the block, and
+ // that overlap the block horizontally
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
+ ++baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base >= minBase - intraLineSpace &&
+ word1->base <= maxBase + intraLineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
+ : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta2 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ found = gTrue;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
}
}
- }
- // if there is an overlapping line and it fits in the block, add
- // it to the block
- if (line4 && blockFit(blk1, line4)) {
- if (line3) {
- line3->next = line4->next;
- } else {
- lineList = line4->next;
- }
- line0->next = line0->flowNext = line4;
- line4->next = NULL;
- if (line4->xMin < blk1->xMin) {
- blk1->xMin = line4->xMin;
- } else if (line4->xMax > blk1->xMax) {
- blk1->xMax = line4->xMax;
+ // only check for outlying words (the next two chunks of code)
+ // if we didn't find anything else
+ if (found) {
+ continue;
}
- if (line4->yMax > blk1->yMax) {
- blk1->yMax = line4->yMax;
+
+ // scan down the left side of the block, looking for words
+ // that are near (but not overlapping) the block; if there are
+ // three or fewer, add them to the block
+ n = 0;
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
+ ++baseIdx) {
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base >= minBase - intraLineSpace &&
+ word1->base <= maxBase + intraLineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMax <= blk->xMin &&
+ word1->xMax > blk->xMin - colSpace)
+ : (word1->yMax <= blk->yMin &&
+ word1->yMax > blk->yMin - colSpace)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta3 * fontSize) {
+ ++n;
+ break;
+ }
+ word1 = word1->next;
+ }
}
- if (line4->xSpaceL > blk1->xSpaceL) {
- blk1->xSpaceL = line4->xSpaceL;
+ if (n > 0 && n <= 3) {
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
+ ++baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base >= minBase - intraLineSpace &&
+ word1->base <= maxBase + intraLineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMax <= blk->xMin &&
+ word1->xMax > blk->xMin - colSpace)
+ : (word1->yMax <= blk->yMin &&
+ word1->yMax > blk->yMin - colSpace)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta3 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ if (word2->base < minBase) {
+ minBase = word2->base;
+ } else if (word2->base > maxBase) {
+ maxBase = word2->base;
+ }
+ found = gTrue;
+ break;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
}
- if (line4->xSpaceR < blk1->xSpaceR) {
- blk1->xSpaceR = line4->xSpaceR;
+
+ // scan down the right side of the block, looking for words
+ // that are near (but not overlapping) the block; if there are
+ // three or fewer, add them to the block
+ n = 0;
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
+ ++baseIdx) {
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base >= minBase - intraLineSpace &&
+ word1->base <= maxBase + intraLineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin >= blk->xMax &&
+ word1->xMin < blk->xMax + colSpace)
+ : (word1->yMin >= blk->yMax &&
+ word1->yMin < blk->yMax + colSpace)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta3 * fontSize) {
+ ++n;
+ break;
+ }
+ word1 = word1->next;
+ }
}
- if (line4->fontSize > blk1->maxFontSize) {
- blk1->maxFontSize = line4->fontSize;
+ if (n > 0 && n <= 3) {
+ for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
+ baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
+ ++baseIdx) {
+ word0 = NULL;
+ word1 = pool->getPool(baseIdx);
+ while (word1) {
+ if (word1->base >= minBase - intraLineSpace &&
+ word1->base <= maxBase + intraLineSpace &&
+ ((rot == 0 || rot == 2)
+ ? (word1->xMin >= blk->xMax &&
+ word1->xMin < blk->xMax + colSpace)
+ : (word1->yMin >= blk->yMax &&
+ word1->yMin < blk->yMax + colSpace)) &&
+ fabs(word1->fontSize - fontSize) <
+ maxBlockFontSizeDelta3 * fontSize) {
+ word2 = word1;
+ if (word0) {
+ word0->next = word1->next;
+ } else {
+ pool->setPool(baseIdx, word1->next);
+ }
+ word1 = word1->next;
+ word2->next = NULL;
+ blk->addWord(word2);
+ if (word2->base < minBase) {
+ minBase = word2->base;
+ } else if (word2->base > maxBase) {
+ maxBase = word2->base;
+ }
+ found = gTrue;
+ break;
+ } else {
+ word0 = word1;
+ word1 = word1->next;
+ }
+ }
+ }
}
- line0 = line4;
- // otherwise, we're done with this block
+ } while (found);
+
+ //~ need to compute the primary writing mode (horiz/vert) in
+ //~ addition to primary rotation
+
+ // coalesce the block, and add it to the list
+ blk->coalesce(uMap);
+ if (lastBlk) {
+ lastBlk->next = blk;
} else {
- break;
+ blkList = blk;
+ }
+ lastBlk = blk;
+ count[rot] += blk->charCount;
+ if (primaryRot < 0 || count[rot] > count[primaryRot]) {
+ primaryRot = rot;
}
+ ++nBlocks;
}
+ }
- // insert block on list, in yx order
- if (rawOrder) {
- blk2 = blk0;
- blk3 = NULL;
- blk0 = blk1;
- } else {
- for (blk2 = NULL, blk3 = yxBlocks;
- blk3 && !blk1->yxBefore(blk3);
- blk2 = blk3, blk3 = blk3->next) ;
- }
- blk1->next = blk3;
- if (blk2) {
- blk2->next = blk1;
- } else {
- yxBlocks = blk1;
- }
+#if 0 // for debugging
+ printf("*** rotation ***\n");
+ for (rot = 0; rot < 4; ++rot) {
+ printf(" %d: %6d\n", rot, count[rot]);
}
+ printf(" primary rot = %d\n", primaryRot);
+ printf("\n");
+#endif
#if 0 // for debugging
- printf("*** blocks in yx order ***\n");
- for (blk0 = yxBlocks; blk0; blk0 = blk0->next) {
- printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
- blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
- for (line0 = blk0->lines; line0; line0 = line0->next) {
- printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ printf("*** blocks ***\n");
+ for (blk = blkList; blk; blk = blk->next) {
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
+ for (line = blk->lines; line; line = line->next) {
+ printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
+ line->xMin, line->xMax, line->yMin, line->yMax, line->base);
+ for (word0 = line->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->spaceAfter);
+ word0->base, word0->fontSize, word0->spaceAfter);
for (i = 0; i < word0->len; ++i) {
fputc(word0->text[i] & 0xff, stdout);
}
@@ -1164,148 +2294,109 @@ void TextPage::coalesce(GBool physLayout) {
}
}
printf("\n");
- fflush(stdout);
#endif
- //----- merge lines and blocks, sort blocks into reading order
-
- if (rawOrder) {
- blocks = yxBlocks;
-
- } else {
- blocks = NULL;
- blk0 = NULL;
- blkStack = NULL;
- while (yxBlocks) {
-
- // find the next two blocks:
- // - if the depth-first traversal stack is empty, take the first
- // (upper-left-most) two blocks on the yx-sorted block list
- // - otherwise, find the two upper-left-most blocks under the top
- // block on the stack
- if (blkStack) {
- blk3 = blk4 = blk5 = blk6 = NULL;
- for (blk1 = NULL, blk2 = yxBlocks;
- blk2;
- blk1 = blk2, blk2 = blk2->next) {
- if (blk2->yMin > blkStack->yMin &&
- blk2->xMax > blkStack->xMin &&
- blk2->xMin < blkStack->xMax) {
- if (!blk4 || blk2->yxBefore(blk4)) {
- blk5 = blk3;
- blk6 = blk4;
- blk3 = blk1;
- blk4 = blk2;
- } else if (!blk6 || blk2->yxBefore(blk6)) {
- blk5 = blk1;
- blk6 = blk2;
- }
+ // determine the primary direction
+ lrCount = 0;
+ for (blk = blkList; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ for (word0 = line->words; word0; word0 = word0->next) {
+ for (i = 0; i < word0->len; ++i) {
+ if (unicodeTypeL(word0->text[i])) {
+ ++lrCount;
+ } else if (unicodeTypeR(word0->text[i])) {
+ --lrCount;
}
}
- } else {
- blk3 = NULL;
- blk4 = yxBlocks;
- blk5 = yxBlocks;
- blk6 = yxBlocks->next;
}
+ }
+ }
+ primaryLR = lrCount >= 0;
- // merge case 1:
- // | | |
- // | blkStack | | blkStack
- // +---------------------+ --> +--------------
- // +------+ +------+ +-----------+
- // | blk4 | | blk6 | ... | blk4+blk6 |
- // +------+ +------+ +-----------+
- yLimit = 0; // make gcc happy
- if (blkStack) {
- yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize;
- }
- if (blkStack && blk4 && blk6 &&
- !blk4->lines->next && !blk6->lines->next &&
- lineFit2(blk4->lines, blk6->lines) &&
- blk4->yMin < yLimit &&
- blk4->xMin > blkStack->xSpaceL &&
- blkStack->xMin > blk4->xSpaceL &&
- blk6->xMax < blkStack->xSpaceR) {
- blk4->mergeRight(blk6);
- if (blk5) {
- blk5->next = blk6->next;
+#if 0 // for debugging
+ printf("*** direction ***\n");
+ printf("lrCount = %d\n", lrCount);
+ printf("primaryLR = %d\n", primaryLR);
+#endif
+
+ //----- column assignment
+
+ // sort blocks into xy order for column assignment
+ blocks = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *));
+ for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
+ blocks[i] = blk;
+ }
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
+
+ // column assignment
+ for (i = 0; i < nBlocks; ++i) {
+ blk0 = blocks[i];
+ col1 = 0;
+ for (j = 0; j < i; ++j) {
+ blk1 = blocks[j];
+ col2 = 0; // make gcc happy
+ switch (primaryRot) {
+ case 0:
+ if (blk0->xMin > blk1->xMax) {
+ col2 = blk1->col + blk1->nColumns + 3;
} else {
- yxBlocks = blk6->next;
+ col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
+ (blk1->xMax - blk1->xMin)) *
+ blk1->nColumns);
}
- delete blk6;
-
- // merge case 2:
- // | | | |
- // | blkStack | | |
- // +---------------------+ --> | blkStack+blk2 |
- // +---------------------+ | |
- // | blk4 | | |
- // | | | |
- } else if (blkStack && blk4 &&
- blk4->yMin < yLimit &&
- blockFit2(blkStack, blk4)) {
- blkStack->mergeBelow(blk4);
- if (blk3) {
- blk3->next = blk4->next;
+ break;
+ case 1:
+ if (blk0->yMin > blk1->yMax) {
+ col2 = blk1->col + blk1->nColumns + 3;
} else {
- yxBlocks = blk4->next;
+ col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
+ (blk1->yMax - blk1->yMin)) *
+ blk1->nColumns);
}
- delete blk4;
-
- // if any of:
- // 1. no block found
- // 2. non-fully overlapping block found
- // 3. large vertical gap above the overlapping block
- // then pop the stack and try again
- } else if (!blk4 ||
- (blkStack && (blk4->xMin < blkStack->xSpaceL ||
- blk4->xMax > blkStack->xSpaceR ||
- blk4->yMin - blkStack->yMax >
- blkMaxSortSpacing * blkStack->maxFontSize))) {
- blkStack = blkStack->stackNext;
-
- // add a block to the sorted list
- } else {
-
- // remove the block from the yx-sorted list
- if (blk3) {
- blk3->next = blk4->next;
+ break;
+ case 2:
+ if (blk0->xMax < blk1->xMin) {
+ col2 = blk1->col + blk1->nColumns + 3;
} else {
- yxBlocks = blk4->next;
+ col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
+ (blk1->xMin - blk1->xMax)) *
+ blk1->nColumns);
}
- blk4->next = NULL;
-
- // append the block to the reading-order list
- if (blk0) {
- blk0->next = blk4;
+ break;
+ case 3:
+ if (blk0->yMax < blk1->yMin) {
+ col2 = blk1->col + blk1->nColumns + 3;
} else {
- blocks = blk4;
- }
- blk0 = blk4;
-
- // push the block on the traversal stack
- if (!physLayout) {
- blk4->stackNext = blkStack;
- blkStack = blk4;
+ col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
+ (blk1->yMin - blk1->yMax)) *
+ blk1->nColumns);
}
+ break;
+ }
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ blk0->col = col1;
+ for (line = blk0->lines; line; line = line->next) {
+ for (j = 0; j <= line->len; ++j) {
+ line->col[j] += col1;
}
}
- } // (!rawOrder)
+ }
#if 0 // for debugging
- printf("*** blocks in reading order (after merging) ***\n");
- for (blk0 = blocks; blk0; blk0 = blk0->next) {
- printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
- blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
- for (line0 = blk0->lines; line0; line0 = line0->next) {
- printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ printf("*** blocks, after column assignment ***\n");
+ for (blk = blkList; blk; blk = blk->next) {
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
+ blk->nColumns);
+ for (line = blk->lines; line; line = line->next) {
+ printf(" line:\n");
+ for (word0 = line->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->spaceAfter);
+ word0->base, word0->fontSize, word0->spaceAfter);
for (i = 0; i < word0->len; ++i) {
fputc(word0->text[i] & 0xff, stdout);
}
@@ -1314,118 +2405,152 @@ void TextPage::coalesce(GBool physLayout) {
}
}
printf("\n");
- fflush(stdout);
#endif
- //----- assemble blocks into flows
+ //----- reading order sort
- if (rawOrder) {
+ // sort blocks into yx order (in preparation for reading order sort)
+ qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
- // one flow per block
- flow0 = NULL;
- while (blocks) {
- flow1 = new TextFlow();
- flow1->blocks = blocks;
- flow1->lines = blocks->lines;
- flow1->yMin = blocks->yMin;
- flow1->yMax = blocks->yMax;
- blocks = blocks->next;
- flow1->blocks->next = NULL;
- if (flow0) {
- flow0->next = flow1;
- } else {
- flows = flow1;
+ // compute space on left and right sides of each block
+ for (i = 0; i < nBlocks; ++i) {
+ blk0 = blocks[i];
+ for (j = 0; j < nBlocks; ++j) {
+ blk1 = blocks[j];
+ if (blk1 != blk0) {
+ blk0->updatePriMinMax(blk1);
}
- flow0 = flow1;
}
+ }
- } else {
-
- // compute whitespace above and below each block
- for (blk0 = blocks; blk0; blk0 = blk0->next) {
- blk0->ySpaceT = 0;
- blk0->ySpaceB = pageHeight;
-
- // check each horizontally overlapping block
- for (blk1 = blocks; blk1; blk1 = blk1->next) {
- if (blk1 != blk0 &&
- blk1->xMin < blk0->xMax &&
- blk1->xMax > blk0->xMin) {
- if (blk1->yMax < blk0->yMin) {
- if (blk1->yMax > blk0->ySpaceT) {
- blk0->ySpaceT = blk1->yMax;
- }
- } else if (blk1->yMin > blk0->yMax) {
- if (blk1->yMin < blk0->ySpaceB) {
- blk0->ySpaceB = blk1->yMin;
- }
- }
+#if 0 // for debugging
+ printf("*** blocks, after yx sort ***\n");
+ for (i = 0; i < nBlocks; ++i) {
+ blk = blocks[i];
+ printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
+ blk->priMin, blk->priMax);
+ for (line = blk->lines; line; line = line->next) {
+ printf(" line:\n");
+ for (word0 = line->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->base, word0->fontSize, word0->spaceAfter);
+ for (j = 0; j < word0->len; ++j) {
+ fputc(word0->text[j] & 0xff, stdout);
}
+ printf("'\n");
}
}
+ }
+ printf("\n");
+#endif
- flow0 = NULL;
- while (blocks) {
-
- // build a new flow object
- flow1 = new TextFlow();
- flow1->blocks = blocks;
- flow1->lines = blocks->lines;
- flow1->yMin = blocks->yMin;
- flow1->yMax = blocks->yMax;
- flow1->ySpaceT = blocks->ySpaceT;
- flow1->ySpaceB = blocks->ySpaceB;
-
- // find subsequent blocks in the flow
- for (blk1 = blocks, blk2 = blocks->next;
- blk2 && flowFit(flow1, blk2);
- blk1 = blk2, blk2 = blk2->next) {
- if (blk2->yMin < flow1->yMin) {
- flow1->yMin = blk2->yMin;
- }
- if (blk2->yMax > flow1->yMax) {
- flow1->yMax = blk2->yMax;
- }
- if (blk2->ySpaceT > flow1->ySpaceT) {
- flow1->ySpaceT = blk2->ySpaceT;
+ // build the flows
+ //~ this needs to be adjusted for writing mode (vertical text)
+ //~ this also needs to account for right-to-left column ordering
+ blkArray = (TextBlock **)gmalloc(nBlocks * sizeof(TextBlock *));
+ memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
+ flows = lastFlow = NULL;
+ firstBlkIdx = 0;
+ nBlocksLeft = nBlocks;
+ while (nBlocksLeft > 0) {
+
+ // find the upper-left-most block
+ for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
+ i = firstBlkIdx;
+ blk = blkArray[i];
+ for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
+ blk1 = blkArray[j];
+ if (blk1) {
+ if (blk && blk->secondaryDelta(blk1) > 0) {
+ break;
}
- if (blk2->ySpaceB < flow1->ySpaceB) {
- flow1->ySpaceB = blk2->ySpaceB;
+ if (blk1->primaryCmp(blk) < 0) {
+ i = j;
+ blk = blk1;
}
- for (line1 = blk1->lines; line1->next; line1 = line1->next) ;
- line1->flowNext = blk2->lines;
}
+ }
+ blkArray[i] = NULL;
+ --nBlocksLeft;
+ blk->next = NULL;
- // chop the block list
- blocks = blk1->next;
- blk1->next = NULL;
+ // create a new flow, starting with the upper-left-most block
+ flow = new TextFlow(this, blk);
+ if (lastFlow) {
+ lastFlow->next = flow;
+ } else {
+ flows = flow;
+ }
+ lastFlow = flow;
+ fontSize = blk->lines->words->fontSize;
+
+ // push the upper-left-most block on the stack
+ blk->stackNext = NULL;
+ blkStack = blk;
+
+ // find the other blocks in this flow
+ while (blkStack) {
+
+ // find the upper-left-most block under (but within
+ // maxBlockSpacing of) the top block on the stack
+ blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
+ blk = NULL;
+ i = -1;
+ for (j = firstBlkIdx; j < nBlocks; ++j) {
+ blk1 = blkArray[j];
+ if (blk1) {
+ if (blkStack->secondaryDelta(blk1) > blkSpace) {
+ break;
+ }
+ if (blk && blk->secondaryDelta(blk1) > 0) {
+ break;
+ }
+ if (blk1->isBelow(blkStack) &&
+ (!blk || blk1->primaryCmp(blk) < 0)) {
+ i = j;
+ blk = blk1;
+ }
+ }
+ }
- // append the flow to the list
- if (flow0) {
- flow0->next = flow1;
+ // if a suitable block was found, add it to the flow and push it
+ // onto the stack
+ if (blk && flow->blockFits(blk, blkStack)) {
+ blkArray[i] = NULL;
+ --nBlocksLeft;
+ blk->next = NULL;
+ flow->addBlock(blk);
+ fontSize = blk->lines->words->fontSize;
+ blk->stackNext = blkStack;
+ blkStack = blk;
+
+ // otherwise (if there is no block under the top block or the
+ // block is not suitable), pop the stack
} else {
- flows = flow1;
+ blkStack = blkStack->stackNext;
}
- flow0 = flow1;
}
}
+ gfree(blkArray);
#if 0 // for debugging
printf("*** flows ***\n");
- for (flow0 = flows; flow0; flow0 = flow0->next) {
- printf("[flow]\n");
- for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) {
- printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n",
- blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax,
- blk0->ySpaceT, blk0->ySpaceB);
- for (line0 = blk0->lines; line0; line0 = line0->next) {
- printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ for (flow = flows; flow; flow = flow->next) {
+ printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
+ flow->xMin, flow->xMax, flow->yMin, flow->yMax,
+ flow->priMin, flow->priMax);
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
+ blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
+ blk->priMin, blk->priMax);
+ for (line = blk->lines; line; line = line->next) {
+ printf(" line:\n");
+ for (word0 = line->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->spaceAfter);
+ word0->base, word0->fontSize, word0->spaceAfter);
for (i = 0; i < word0->len; ++i) {
fputc(word0->text[i] & 0xff, stdout);
}
@@ -1435,288 +2560,159 @@ void TextPage::coalesce(GBool physLayout) {
}
}
printf("\n");
- fflush(stdout);
#endif
- //----- sort lines into yx order
-
- // (the block/line merging process doesn't maintain the full-page
- // linked list of lines)
-
- lines = NULL;
- if (rawOrder) {
- line0 = NULL;
- for (flow0 = flows; flow0; flow0 = flow0->next) {
- for (line1 = flow0->lines; line1; line1 = line1->flowNext) {
- if (line0) {
- line0->pageNext = line1;
- } else {
- lines = line1;
- }
- line0 = line1;
- }
- }
- } else {
- for (flow0 = flows; flow0; flow0 = flow0->next) {
- for (line0 = flow0->lines; line0; line0 = line0->flowNext) {
- for (line1 = NULL, line2 = lines;
- line2 && !line0->yxBefore(line2);
- line1 = line2, line2 = line2->pageNext) ;
- if (line1) {
- line1->pageNext = line0;
- } else {
- lines = line0;
- }
- line0->pageNext = line2;
- }
- }
- }
-
-#if 0 // for debugging
- printf("*** lines in yx order ***\n");
- for (line0 = lines; line0; line0 = line0->pageNext) {
- printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n",
- line0->xMin, line0->xMax, line0->yMin, line0->yMax,
- line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0],
- line0->len);
- for (word0 = line0->words; word0; word0 = word0->next) {
- printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
- word0->xMin, word0->xMax, word0->yMin, word0->yMax,
- word0->yBase, word0->spaceAfter);
- for (i = 0; i < word0->len; ++i) {
- fputc(word0->text[i] & 0xff, stdout);
- }
- printf("'\n");
- }
- }
- printf("\n");
- fflush(stdout);
-#endif
-}
-
-// If <word> can be added the end of <line>, return the absolute value
-// of the difference between <line>'s baseline and <word>'s baseline,
-// and set *<space> to the horizontal space between the current last
-// word in <line> and <word>. A smaller return value indicates a
-// better fit. Otherwise, return a negative number.
-double TextPage::lineFit(TextLine *line, TextWord *word, double *space) {
- TextWord *lastWord;
- double fontSize0, fontSize1;
- double dx, dxLimit;
-
- lastWord = line->lastWord;
- fontSize0 = line->fontSize;
- fontSize1 = word->fontSize;
- dx = word->xMin - lastWord->xMax;
- dxLimit = fontSize0 * lastWord->font->maxSpaceWidth;
-
- // check inter-word spacing
- if (dx < fontSize0 * lineMinDeltaX ||
- dx > dxLimit) {
- return -1;
- }
-
- if (
- // look for adjacent words with close baselines and close font sizes
- (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 &&
- fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
- fontSize1 < lineMaxFontSizeRatio * fontSize0) ||
-
- // look for a superscript
- (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 &&
- fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 &&
- (word->yMax < lastWord->yMax ||
- word->yBase < lastWord->yBase) &&
- word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 &&
- dx < fontSize0 * lineMaxSuperscriptDeltaX) ||
-
- // look for a subscript
- (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 &&
- fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 &&
- (word->yMin > lastWord->yMin ||
- word->yBase > lastWord->yBase) &&
- line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 &&
- dx < fontSize0 * lineMaxSubscriptDeltaX)) {
-
- *space = dx;
- return fabs(word->yBase - line->yBase);
- }
-
- return -1;
-}
-
-// Returns true if <line0> and <line1> can be merged into a single
-// line, ignoring max word spacing.
-GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) {
- double fontSize0, fontSize1;
- double dx;
-
- fontSize0 = line0->fontSize;
- fontSize1 = line1->fontSize;
- dx = line1->xMin - line0->xMax;
-
- // check inter-word spacing
- if (dx < fontSize0 * lineMinDeltaX) {
- return gFalse;
- }
-
- // look for close baselines and close font sizes
- if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 &&
- fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
- fontSize1 < lineMaxFontSizeRatio * fontSize0) {
- return gTrue;
+ if (uMap) {
+ uMap->decRefCnt();
}
-
- return gFalse;
}
-// Returns true if <line> can be added to <blk>. Assumes the y
-// coordinates are within range.
-GBool TextPage::blockFit(TextBlock *blk, TextLine *line) {
- double fontSize0, fontSize1;
-
- // check edges
- if (line->xMin < blk->xSpaceL ||
- line->xMax > blk->xSpaceR ||
- blk->xMin < line->xSpaceL ||
- blk->xMax > line->xSpaceR) {
- return gFalse;
- }
-
- // check font sizes
- fontSize0 = blk->lines->fontSize;
- fontSize1 = line->fontSize;
- if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
- fontSize1 > blkMaxFontSizeRatio * fontSize0) {
- return gFalse;
- }
-
- return gTrue;
-}
+GBool TextPage::findText(Unicode *s, int len,
+ GBool startAtTop, GBool stopAtBottom,
+ GBool startAtLast, GBool stopAtLast,
+ double *xMin, double *yMin,
+ double *xMax, double *yMax) {
+ TextBlock *blk;
+ TextLine *line;
+ Unicode *p;
+ Unicode u1, u2;
+ int m, i, j, k;
+ double xStart, yStart, xStop, yStop;
+ double xMin0, yMin0, xMax0, yMax0;
+ double xMin1, yMin1, xMax1, yMax1;
+ GBool found;
-// Returns true if <blk0> and <blk1> can be merged into a single
-// block. Assumes the y coordinates are within range.
-GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) {
- double fontSize0, fontSize1;
+ //~ needs to handle right-to-left text
- // check edges
- if (blk1->xMin < blk0->xSpaceL ||
- blk1->xMax > blk0->xSpaceR ||
- blk0->xMin < blk1->xSpaceL ||
- blk0->xMax > blk1->xSpaceR) {
+ if (rawOrder) {
return gFalse;
}
- // check font sizes
- fontSize0 = blk0->lines->fontSize;
- fontSize1 = blk1->lines->fontSize;
- if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
- fontSize1 > blkMaxFontSizeRatio * fontSize0) {
- return gFalse;
+ xStart = yStart = xStop = yStop = 0;
+ if (startAtLast && haveLastFind) {
+ xStart = lastFindXMin;
+ yStart = lastFindYMin;
+ } else if (!startAtTop) {
+ xStart = *xMin;
+ yStart = *yMin;
}
-
- return gTrue;
-}
-
-// Returns true if <blk> can be added to <flow>.
-GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) {
- double dy;
-
- // check whitespace above and below
- if (blk->yMin < flow->ySpaceT ||
- blk->yMax > flow->ySpaceB ||
- flow->yMin < blk->ySpaceT ||
- flow->yMax > blk->ySpaceB) {
- return gFalse;
+ if (stopAtLast && haveLastFind) {
+ xStop = lastFindXMin;
+ yStop = lastFindYMin;
+ } else if (!stopAtBottom) {
+ xStop = *xMax;
+ yStop = *yMax;
}
- // check that block top edge is within +/- dy of flow top edge,
- // and that block bottom edge is above flow bottom edge + dy
- dy = flowMaxDeltaY * flow->blocks->maxFontSize;
- return blk->yMin > flow->yMin - dy &&
- blk->yMin < flow->yMin + dy &&
- blk->yMax < flow->yMax + dy;
-}
+ found = gFalse;
+ xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
+ xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
+ for (i = 0; i < nBlocks; ++i) {
+ blk = blocks[i];
-GBool TextPage::findText(Unicode *s, int len,
- GBool top, GBool bottom,
- double *xMin, double *yMin,
- double *xMax, double *yMax) {
- TextLine *line;
- Unicode *p;
- Unicode u1, u2;
- int m, i, j;
- double x0, x1, x;
-
- // scan all text on the page
- for (line = lines; line; line = line->pageNext) {
-
- // check: above top limit?
- if (!top && (line->yMax < *yMin ||
- (line->yMin < *yMin && line->xMax <= *xMin))) {
+ // check: is the block above the top limit?
+ if (!startAtTop && blk->yMax < yStart) {
continue;
}
- // check: below bottom limit?
- if (!bottom && (line->yMin > *yMax ||
- (line->yMax > *yMax && line->xMin >= *xMax))) {
- return gFalse;
+ // check: is the block below the bottom limit?
+ if (!stopAtBottom && blk->yMin > yStop) {
+ break;
}
- // search each position in this line
- m = line->len;
- for (i = 0, p = line->text; i <= m - len; ++i, ++p) {
-
- x0 = (i == 0) ? line->xMin : line->xRight[i-1];
- x1 = line->xRight[i];
- x = 0.5 * (x0 + x1);
+ for (line = blk->lines; line; line = line->next) {
- // check: above top limit?
- if (!top && line->yMin < *yMin) {
- if (x < *xMin) {
- continue;
- }
+ // check: is the line above the top limit?
+ if (!startAtTop && line->yMin < yStart) {
+ continue;
}
- // check: below bottom limit?
- if (!bottom && line->yMax > *yMax) {
- if (x > *xMax) {
- return gFalse;
- }
+ // check: is the line below the bottom limit?
+ if (!stopAtBottom && line->yMin > yStop) {
+ continue;
}
- // compare the strings
- for (j = 0; j < len; ++j) {
+ // search each position in this line
+ m = line->len;
+ for (j = 0, p = line->text; j <= m - len; ++j, ++p) {
+
+ // compare the strings
+ for (k = 0; k < len; ++k) {
#if 1 //~ this lowercases Latin A-Z only -- this will eventually be
//~ extended to handle other character sets
- if (p[j] >= 0x41 && p[j] <= 0x5a) {
- u1 = p[j] + 0x20;
- } else {
- u1 = p[j];
- }
- if (s[j] >= 0x41 && s[j] <= 0x5a) {
- u2 = s[j] + 0x20;
- } else {
- u2 = s[j];
- }
+ if (p[k] >= 0x41 && p[k] <= 0x5a) {
+ u1 = p[k] + 0x20;
+ } else {
+ u1 = p[k];
+ }
+ if (s[k] >= 0x41 && s[k] <= 0x5a) {
+ u2 = s[k] + 0x20;
+ } else {
+ u2 = s[k];
+ }
#endif
- if (u1 != u2) {
- break;
+ if (u1 != u2) {
+ break;
+ }
}
- }
- // found it
- if (j == len) {
- *xMin = x0;
- *xMax = line->xRight[i + len - 1];
- *yMin = line->yMin;
- *yMax = line->yMax;
- return gTrue;
+ // found it
+ if (k == len) {
+ switch (line->rot) {
+ case 0:
+ xMin1 = line->edge[j];
+ xMax1 = line->edge[j + len];
+ yMin1 = line->yMin;
+ yMax1 = line->yMax;
+ break;
+ case 1:
+ xMin1 = line->xMin;
+ xMax1 = line->xMax;
+ yMin1 = line->edge[j];
+ yMax1 = line->edge[j + len];
+ break;
+ case 2:
+ xMin1 = line->edge[j + len];
+ xMax1 = line->edge[j];
+ yMin1 = line->yMin;
+ yMax1 = line->yMax;
+ break;
+ case 3:
+ xMin1 = line->xMin;
+ xMax1 = line->xMax;
+ yMin1 = line->edge[j + len];
+ yMax1 = line->edge[j];
+ break;
+ }
+ if ((startAtTop ||
+ yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
+ (stopAtBottom ||
+ yMin1 < yStop || (yMin1 == yStop && xMin1 < yStop))) {
+ if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
+ xMin0 = xMin1;
+ xMax0 = xMax1;
+ yMin0 = yMin1;
+ yMax0 = yMax1;
+ found = gTrue;
+ }
+ }
+ }
}
}
}
+ if (found) {
+ *xMin = xMin0;
+ *xMax = xMax0;
+ *yMin = yMin0;
+ *yMax = yMax0;
+ lastFindXMin = xMin0;
+ lastFindYMin = yMin0;
+ haveLastFind = gTrue;
+ return gTrue;
+ }
+
return gFalse;
}
@@ -1725,15 +2721,24 @@ GString *TextPage::getText(double xMin, double yMin,
GString *s;
UnicodeMap *uMap;
GBool isUnicode;
- char space[8], eol[16], buf[8];
- int spaceLen, eolLen, len;
- TextLine *line, *prevLine;
- double x0, x1, y;
- int firstCol, col, i;
- GBool multiLine;
+ TextBlock *blk;
+ TextLine *line;
+ TextLineFrag *frags;
+ int nFrags, fragsSize;
+ TextLineFrag *frag;
+ char space[8], eol[16];
+ int spaceLen, eolLen;
+ int lastRot;
+ double x, y;
+ int col, idx0, idx1, i, j;
+ GBool multiLine, oneRot;
s = new GString();
+ if (rawOrder) {
+ return s;
+ }
+
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
return s;
@@ -1754,109 +2759,173 @@ GString *TextPage::getText(double xMin, double yMin,
break;
}
- // find the leftmost column
- firstCol = -1;
- for (line = lines; line; line = line->pageNext) {
- if (line->yMin > yMax) {
- break;
- }
- if (line->yMax < yMin ||
- line->xMax < xMin ||
- line->xMin > xMax) {
- continue;
- }
-
- y = 0.5 * (line->yMin + line->yMax);
- if (y < yMin || y > yMax) {
- continue;
- }
-
- i = 0;
- while (i < line->len) {
- x0 = (i==0) ? line->xMin : line->xRight[i-1];
- x1 = line->xRight[i];
- if (0.5 * (x0 + x1) > xMin) {
- break;
+ //~ writing mode (horiz/vert)
+
+ // collect the line fragments that are in the rectangle
+ fragsSize = 256;
+ frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
+ nFrags = 0;
+ lastRot = -1;
+ oneRot = gTrue;
+ for (i = 0; i < nBlocks; ++i) {
+ blk = blocks[i];
+ if (xMin < blk->xMax && blk->xMin < xMax &&
+ yMin < blk->yMax && blk->yMin < yMax) {
+ for (line = blk->lines; line; line = line->next) {
+ if (xMin < line->xMax && line->xMin < xMax &&
+ yMin < line->yMax && line->yMin < yMax) {
+ idx0 = idx1 = -1;
+ switch (line->rot) {
+ case 0:
+ y = 0.5 * (line->yMin + line->yMax);
+ if (yMin < y && y < yMax) {
+ j = 0;
+ while (j < line->len) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
+ idx0 = j;
+ break;
+ }
+ ++j;
+ }
+ j = line->len - 1;
+ while (j >= 0) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
+ idx1 = j;
+ break;
+ }
+ --j;
+ }
+ }
+ break;
+ case 1:
+ x = 0.5 * (line->xMin + line->xMax);
+ if (xMin < x && x < xMax) {
+ j = 0;
+ while (j < line->len) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
+ idx0 = j;
+ break;
+ }
+ ++j;
+ }
+ j = line->len - 1;
+ while (j >= 0) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
+ idx1 = j;
+ break;
+ }
+ --j;
+ }
+ }
+ break;
+ case 2:
+ y = 0.5 * (line->yMin + line->yMax);
+ if (yMin < y && y < yMax) {
+ j = 0;
+ while (j < line->len) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
+ idx0 = j;
+ break;
+ }
+ ++j;
+ }
+ j = line->len - 1;
+ while (j >= 0) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
+ idx1 = j;
+ break;
+ }
+ --j;
+ }
+ }
+ break;
+ case 3:
+ x = 0.5 * (line->xMin + line->xMax);
+ if (xMin < x && x < xMax) {
+ j = 0;
+ while (j < line->len) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
+ idx0 = j;
+ break;
+ }
+ ++j;
+ }
+ j = line->len - 1;
+ while (j >= 0) {
+ if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
+ idx1 = j;
+ break;
+ }
+ --j;
+ }
+ }
+ break;
+ }
+ if (idx0 >= 0 && idx1 >= 0) {
+ if (nFrags == fragsSize) {
+ fragsSize *= 2;
+ frags = (TextLineFrag *)
+ grealloc(frags, fragsSize * sizeof(TextLineFrag));
+ }
+ frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
+ ++nFrags;
+ if (lastRot >= 0 && line->rot != lastRot) {
+ oneRot = gFalse;
+ }
+ lastRot = line->rot;
+ }
+ }
}
- ++i;
- }
- if (i == line->len) {
- continue;
- }
- col = line->col[i];
-
- if (firstCol < 0 || col < firstCol) {
- firstCol = col;
}
}
- // extract the text
- col = firstCol;
- multiLine = gFalse;
- prevLine = NULL;
- for (line = lines; line; line = line->pageNext) {
- if (line->yMin > yMax) {
- break;
- }
- if (line->yMax < yMin ||
- line->xMax < xMin ||
- line->xMin > xMax) {
- continue;
- }
+ // sort the fragments and generate the string
+ if (nFrags > 0) {
- y = 0.5 * (line->yMin + line->yMax);
- if (y < yMin || y > yMax) {
- continue;
+ for (i = 0; i < nFrags; ++i) {
+ frags[i].computeCoords(oneRot);
}
+ assignColumns(frags, nFrags, oneRot);
- i = 0;
- while (i < line->len) {
- x0 = (i==0) ? line->xMin : line->xRight[i-1];
- x1 = line->xRight[i];
- if (0.5 * (x0 + x1) > xMin) {
- break;
- }
- ++i;
- }
- if (i == line->len) {
- continue;
- }
-
- // insert a return
- if (line->col[i] < col ||
- (prevLine &&
- line->yMin >
- prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) {
- s->append(eol, eolLen);
- col = firstCol;
- multiLine = gTrue;
- }
- prevLine = line;
-
- // line this block up with the correct column
- for (; col < line->col[i]; ++col) {
- s->append(space, spaceLen);
+ // if all lines in the region have the same rotation, use it;
+ // otherwise, use the page's primary rotation
+ if (oneRot) {
+ qsort(frags, nFrags, sizeof(TextLineFrag),
+ &TextLineFrag::cmpYXLineRot);
+ } else {
+ qsort(frags, nFrags, sizeof(TextLineFrag),
+ &TextLineFrag::cmpYXPrimaryRot);
}
- // print the portion of the line
- for (; i < line->len; ++i) {
+ col = 0;
+ multiLine = gFalse;
+ for (i = 0; i < nFrags; ++i) {
+ frag = &frags[i];
+
+ // insert a return
+ if (frag->col < col ||
+ (i > 0 && fabs(frag->base - frags[i-1].base) >
+ maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
+ s->append(eol, eolLen);
+ col = 0;
+ multiLine = gTrue;
+ }
- x0 = (i==0) ? line->xMin : line->xRight[i-1];
- x1 = line->xRight[i];
- if (0.5 * (x0 + x1) > xMax) {
- break;
+ // column alignment
+ for (; col < frag->col; ++col) {
+ s->append(space, spaceLen);
}
- len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
- s->append(buf, len);
- col += isUnicode ? 1 : len;
+ // get the fragment text
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
}
- }
- if (multiLine) {
- s->append(eol, eolLen);
+ if (multiLine) {
+ s->append(eol, eolLen);
+ }
}
+ gfree(frags);
uMap->decRefCnt();
return s;
@@ -1865,58 +2934,107 @@ GString *TextPage::getText(double xMin, double yMin,
GBool TextPage::findCharRange(int pos, int length,
double *xMin, double *yMin,
double *xMax, double *yMax) {
+ TextBlock *blk;
TextLine *line;
TextWord *word;
- double x;
+ double xMin0, xMax0, yMin0, yMax0;
+ double xMin1, xMax1, yMin1, yMax1;
GBool first;
- int i;
+ int i, j0, j1;
+
+ if (rawOrder) {
+ return gFalse;
+ }
//~ this doesn't correctly handle:
//~ - ranges split across multiple lines (the highlighted region
//~ is the bounding box of all the parts of the range)
//~ - cases where characters don't convert one-to-one into Unicode
first = gTrue;
- for (line = lines; line; line = line->pageNext) {
- for (word = line->words; word; word = word->next) {
- if (pos < word->charPos + word->charLen &&
- word->charPos < pos + length) {
- i = pos - word->charPos;
- if (i < 0) {
- i = 0;
- }
- x = (i == 0) ? word->xMin : word->xRight[i - 1];
- if (first || x < *xMin) {
- *xMin = x;
- }
- i = pos + length - word->charPos;
- if (i >= word->len) {
- i = word->len - 1;
- }
- x = word->xRight[i];
- if (first || x > *xMax) {
- *xMax = x;
- }
- if (first || word->yMin < *yMin) {
- *yMin = word->yMin;
- }
- if (first || word->yMax > *yMax) {
- *yMax = word->yMax;
+ xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
+ xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
+ for (i = 0; i < nBlocks; ++i) {
+ blk = blocks[i];
+ for (line = blk->lines; line; line = line->next) {
+ for (word = line->words; word; word = word->next) {
+ if (pos < word->charPos + word->charLen &&
+ word->charPos < pos + length) {
+ j0 = pos - word->charPos;
+ if (j0 < 0) {
+ j0 = 0;
+ }
+ j1 = pos + length - 1 - word->charPos;
+ if (j1 >= word->len) {
+ j1 = word->len - 1;
+ }
+ switch (line->rot) {
+ case 0:
+ xMin1 = word->edge[j0];
+ xMax1 = word->edge[j1 + 1];
+ yMin1 = word->yMin;
+ yMax1 = word->yMax;
+ break;
+ case 1:
+ xMin1 = word->xMin;
+ xMax1 = word->xMax;
+ yMin1 = word->edge[j0];
+ yMax1 = word->edge[j1 + 1];
+ break;
+ case 2:
+ xMin1 = word->edge[j1 + 1];
+ xMax1 = word->edge[j0];
+ yMin1 = word->yMin;
+ yMax1 = word->yMax;
+ break;
+ case 3:
+ xMin1 = word->xMin;
+ xMax1 = word->xMax;
+ yMin1 = word->edge[j1 + 1];
+ yMax1 = word->edge[j0];
+ break;
+ }
+ if (first || xMin1 < xMin0) {
+ xMin0 = xMin1;
+ }
+ if (first || xMax1 > xMax0) {
+ xMax0 = xMax1;
+ }
+ if (first || yMin1 < yMin0) {
+ yMin0 = yMin1;
+ }
+ if (first || yMax1 > yMax0) {
+ yMax0 = yMax1;
+ }
+ first = gFalse;
}
- first = gFalse;
}
}
}
- return !first;
+ if (!first) {
+ *xMin = xMin0;
+ *xMax = xMax0;
+ *yMin = yMin0;
+ *yMax = yMax0;
+ return gTrue;
+ }
+ return gFalse;
}
void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
GBool physLayout) {
UnicodeMap *uMap;
- char space[8], eol[16], eop[8], buf[8];
- int spaceLen, eolLen, eopLen, len;
TextFlow *flow;
+ TextBlock *blk;
TextLine *line;
- int col, d, n, i;
+ TextLineFrag *frags;
+ TextWord *word;
+ int nFrags, fragsSize;
+ TextLineFrag *frag;
+ char space[8], eol[16], eop[8];
+ int spaceLen, eolLen, eopLen;
+ GBool pageBreaks;
+ GString *s;
+ int col, i, d, n;
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
@@ -1937,69 +3055,119 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
break;
}
eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
+ pageBreaks = globalParams->getTextPageBreaks();
- // output the page, maintaining the original physical layout
- if (physLayout || rawOrder) {
- col = 0;
- for (line = lines; line; line = line->pageNext) {
+ //~ writing mode (horiz/vert)
- // line this block up with the correct column
- if (!rawOrder) {
- for (; col < line->col[0]; ++col) {
+ // output the page in raw (content stream) order
+ if (rawOrder) {
+
+ for (word = rawWords; word; word = word->next) {
+ s = new GString();
+ dumpFragment(word->text, word->len, uMap, s);
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
+ delete s;
+ if (word->next &&
+ fabs(word->next->base - word->base) <
+ maxIntraLineDelta * word->fontSize) {
+ if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
(*outputFunc)(outputStream, space, spaceLen);
}
+ } else {
+ (*outputFunc)(outputStream, eol, eolLen);
}
+ }
- // print the line
- for (i = 0; i < line->len; ++i) {
- len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
- (*outputFunc)(outputStream, buf, len);
+ // output the page, maintaining the original physical layout
+ } else if (physLayout) {
+
+ // collect the line fragments for the page and sort them
+ fragsSize = 256;
+ frags = (TextLineFrag *)gmalloc(fragsSize * sizeof(TextLineFrag));
+ nFrags = 0;
+ for (i = 0; i < nBlocks; ++i) {
+ blk = blocks[i];
+ for (line = blk->lines; line; line = line->next) {
+ if (nFrags == fragsSize) {
+ fragsSize *= 2;
+ frags = (TextLineFrag *)grealloc(frags,
+ fragsSize * sizeof(TextLineFrag));
+ }
+ frags[nFrags].init(line, 0, line->len);
+ frags[nFrags].computeCoords(gTrue);
+ ++nFrags;
}
- col += line->convertedLen;
+ }
+ qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
- // print one or more returns if necessary
- if (rawOrder ||
- !line->pageNext ||
- line->pageNext->col[0] < col ||
- line->pageNext->yMin >
- line->yMax - lineOverlapSlack * line->fontSize) {
-
- // compute number of returns
- d = 1;
- if (line->pageNext) {
- d += (int)((line->pageNext->yMin - line->yMax) /
- line->fontSize + 0.5);
- }
+ // generate output
+ col = 0;
+ for (i = 0; i < nFrags; ++i) {
+ frag = &frags[i];
+
+ // column alignment
+ for (; col < frag->col; ++col) {
+ (*outputFunc)(outputStream, space, spaceLen);
+ }
- // various things (weird font matrices) can result in bogus
- // values here, so do a sanity check
- if (d < 1) {
+ // print the line
+ s = new GString();
+ col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
+ delete s;
+
+ // print one or more returns if necessary
+ if (i == nFrags - 1 ||
+ frags[i+1].col < col ||
+ fabs(frags[i+1].base - frag->base) >
+ maxIntraLineDelta * frag->line->words->fontSize) {
+ if (i < nFrags - 1) {
+ d = (int)((frags[i+1].base - frag->base) /
+ frag->line->words->fontSize);
+ if (d < 1) {
+ d = 1;
+ } else if (d > 5) {
+ d = 5;
+ }
+ } else {
d = 1;
- } else if (d > 5) {
- d = 5;
}
for (; d > 0; --d) {
(*outputFunc)(outputStream, eol, eolLen);
}
-
col = 0;
}
}
+ gfree(frags);
+
// output the page, "undoing" the layout
} else {
for (flow = flows; flow; flow = flow->next) {
- for (line = flow->lines; line; line = line->flowNext) {
- n = line->len;
- if (line->flowNext && line->hyphenated) {
- --n;
- }
- for (i = 0; i < n; ++i) {
- len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
- (*outputFunc)(outputStream, buf, len);
- }
- if (line->flowNext && !line->hyphenated) {
- (*outputFunc)(outputStream, space, spaceLen);
+ for (blk = flow->blocks; blk; blk = blk->next) {
+ for (line = blk->lines; line; line = line->next) {
+ n = line->len;
+ if (line->hyphenated && (line->next || blk->next)) {
+ --n;
+ }
+ s = new GString();
+ dumpFragment(line->text, n, uMap, s);
+ (*outputFunc)(outputStream, s->getCString(), s->getLength());
+ delete s;
+ if (!line->hyphenated) {
+ if (line->next) {
+ (*outputFunc)(outputStream, space, spaceLen);
+ } else if (blk->next) {
+ //~ this is a bit of a kludge - we should really do a more
+ //~ intelligent determination of paragraphs
+ if (blk->next->lines->words->fontSize ==
+ blk->lines->words->fontSize) {
+ (*outputFunc)(outputStream, space, spaceLen);
+ } else {
+ (*outputFunc)(outputStream, eol, eolLen);
+ }
+ }
+ }
}
}
(*outputFunc)(outputStream, eol, eolLen);
@@ -2008,56 +3176,196 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
}
// end of page
- (*outputFunc)(outputStream, eop, eopLen);
- (*outputFunc)(outputStream, eol, eolLen);
+ if (pageBreaks) {
+ (*outputFunc)(outputStream, eop, eopLen);
+ (*outputFunc)(outputStream, eol, eolLen);
+ }
uMap->decRefCnt();
}
-void TextPage::startPage(GfxState *state) {
- clear();
- if (state) {
- pageWidth = state->getPageWidth();
- pageHeight = state->getPageHeight();
+void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
+ TextLineFrag *frag0, *frag1;
+ int rot, col1, col2, i, j, k;
+
+ // all text in the region has the same rotation -- recompute the
+ // column numbers based only on the text in the region
+ if (oneRot) {
+ qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
+ rot = frags[0].line->rot;
+ for (i = 0; i < nFrags; ++i) {
+ frag0 = &frags[i];
+ col1 = 0;
+ for (j = 0; j < i; ++j) {
+ frag1 = &frags[j];
+ col2 = 0; // make gcc happy
+ switch (rot) {
+ case 0:
+ if (frag0->xMin >= frag1->xMax) {
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
+ frag1->line->col[frag1->start]) + 1;
+ } else {
+ for (k = frag1->start;
+ k < frag1->start + frag1->len &&
+ frag0->xMin >= 0.5 * (frag1->line->edge[k] +
+ frag1->line->edge[k+1]);
+ ++k) ;
+ col2 = frag1->col +
+ frag1->line->col[k] - frag1->line->col[frag1->start];
+ }
+ break;
+ case 1:
+ if (frag0->yMin >= frag1->yMax) {
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
+ frag1->line->col[frag1->start]) + 1;
+ } else {
+ for (k = frag1->start;
+ k < frag1->start + frag1->len &&
+ frag0->yMin >= 0.5 * (frag1->line->edge[k] +
+ frag1->line->edge[k+1]);
+ ++k) ;
+ col2 = frag1->col +
+ frag1->line->col[k] - frag1->line->col[frag1->start];
+ }
+ break;
+ case 2:
+ if (frag0->xMax <= frag1->xMin) {
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
+ frag1->line->col[frag1->start]) + 1;
+ } else {
+ for (k = frag1->start;
+ k < frag1->start + frag1->len &&
+ frag0->xMax <= 0.5 * (frag1->line->edge[k] +
+ frag1->line->edge[k+1]);
+ ++k) ;
+ col2 = frag1->col +
+ frag1->line->col[k] - frag1->line->col[frag1->start];
+ }
+ break;
+ case 3:
+ if (frag0->yMax <= frag1->yMin) {
+ col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
+ frag1->line->col[frag1->start]) + 1;
+ } else {
+ for (k = frag1->start;
+ k < frag1->start + frag1->len &&
+ frag0->yMax <= 0.5 * (frag1->line->edge[k] +
+ frag1->line->edge[k+1]);
+ ++k) ;
+ col2 = frag1->col +
+ frag1->line->col[k] - frag1->line->col[frag1->start];
+ }
+ break;
+ }
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ }
+ frag0->col = col1;
+ }
+
+ // the region includes text at different rotations -- use the
+ // globally assigned column numbers, offset by the minimum column
+ // number (i.e., shift everything over to column 0)
} else {
- pageWidth = pageHeight = 0;
+ col1 = frags[0].col;
+ for (i = 1; i < nFrags; ++i) {
+ if (frags[i].col < col1) {
+ col1 = frags[i].col;
+ }
+ }
+ for (i = 0; i < nFrags; ++i) {
+ frags[i].col -= col1;
+ }
}
}
-void TextPage::clear() {
- TextWord *w1, *w2;
- TextFlow *f1, *f2;
+int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
+ GString *s) {
+ char lre[8], rle[8], popdf[8], buf[8];
+ int lreLen, rleLen, popdfLen, n;
+ int nCols, i, j, k;
+
+ nCols = 0;
+
+ if (uMap->isUnicode()) {
+
+ lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
+ rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
+ popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
+
+ if (primaryLR) {
+
+ i = 0;
+ while (i < len) {
+ // output a left-to-right section
+ for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
+ for (k = i; k < j; ++k) {
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+ s->append(buf, n);
+ ++nCols;
+ }
+ i = j;
+ // output a right-to-left section
+ for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
+ if (j > i) {
+ s->append(rle, rleLen);
+ for (k = j - 1; k >= i; --k) {
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+ s->append(buf, n);
+ ++nCols;
+ }
+ s->append(popdf, popdfLen);
+ i = j;
+ }
+ }
+
+ } else {
+
+ s->append(rle, rleLen);
+ i = len - 1;
+ while (i >= 0) {
+ // output a right-to-left section
+ for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
+ for (k = i; k > j; --k) {
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+ s->append(buf, n);
+ ++nCols;
+ }
+ i = j;
+ // output a left-to-right section
+ for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
+ if (j < i) {
+ s->append(lre, lreLen);
+ for (k = j + 1; k <= i; ++k) {
+ n = uMap->mapUnicode(text[k], buf, sizeof(buf));
+ s->append(buf, n);
+ ++nCols;
+ }
+ s->append(popdf, popdfLen);
+ i = j;
+ }
+ }
+ s->append(popdf, popdfLen);
- if (curWord) {
- delete curWord;
- curWord = NULL;
- }
- if (words) {
- for (w1 = words; w1; w1 = w2) {
- w2 = w1->next;
- delete w1;
}
- } else if (flows) {
- for (f1 = flows; f1; f1 = f2) {
- f2 = f1->next;
- delete f1;
+
+ } else {
+ for (i = 0; i < len; ++i) {
+ n = uMap->mapUnicode(text[i], buf, sizeof(buf));
+ s->append(buf, n);
+ nCols += n;
}
}
- deleteGList(fonts, TextFontInfo);
-
- curWord = NULL;
- charPos = 0;
- font = NULL;
- fontSize = 0;
- nest = 0;
- nTinyChars = 0;
- words = wordPtr = NULL;
- lines = NULL;
- flows = NULL;
- fonts = new GList();
+ return nCols;
}
+#if TEXTOUT_WORD_LIST
+TextWordList *TextPage::makeWordList(GBool physLayout) {
+ return new TextWordList(this, physLayout);
+}
+#endif
//------------------------------------------------------------------------
// TextOutputDev
@@ -2153,10 +3461,12 @@ void TextOutputDev::drawChar(GfxState *state, double x, double y,
}
GBool TextOutputDev::findText(Unicode *s, int len,
- GBool top, GBool bottom,
+ GBool startAtTop, GBool stopAtBottom,
+ GBool startAtLast, GBool stopAtLast,
double *xMin, double *yMin,
double *xMax, double *yMax) {
- return text->findText(s, len, top, bottom, xMin, yMin, xMax, yMax);
+ return text->findText(s, len, startAtTop, stopAtBottom,
+ startAtLast, stopAtLast, xMin, yMin, xMax, yMax);
}
GString *TextOutputDev::getText(double xMin, double yMin,
@@ -2170,4 +3480,8 @@ GBool TextOutputDev::findCharRange(int pos, int length,
return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
}
-
+#if TEXTOUT_WORD_LIST
+TextWordList *TextOutputDev::makeWordList() {
+ return text->makeWordList(physLayout);
+}
+#endif