Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary · refs · log · tree · commit · diff · stats
diff options
context:
space:
mode:
author Martin Kretzschmar <mkretzschmar@src.gnome.org> 2003-03-31 23:23:17 (GMT)
committer Martin Kretzschmar <mkretzschmar@src.gnome.org> 2003-03-31 23:23:17 (GMT)
commit 8032fd96d450ac015c0153f1fa57e974d67b4993 (patch)
tree dba81f5197969c0a5e55c50d3474c2cc817b3785
parent 9ac495d6543dbd65992791bb41d5f8fbf90e549c (diff)
update
* ANNOUNCE, CHANGES, README, aconf-win32.h: update * xpdf/CharCodeToUnicode.cc, xpdf/Decrypt.cc, xpdf/FTFont.cc, xpdf/FTFont.h, xpdf/FontEncodingTables.cc, xpdf/Gfx.cc, xpdf/GfxFont.cc, xpdf/GfxState.cc, xpdf/GfxState.h, xpdf/GlobalParams.cc, xpdf/GlobalParams.h, xpdf/Link.cc, xpdf/NameToUnicodeTable.h, xpdf/Stream.cc, xpdf/TextOutputDev.cc, xpdf/TextOutputDev.h, xpdf/XOutputDev.cc, xpdf/config.h, xpdf/pdftotext.cc, xpdf/xpdf.cc, xpdf/Outline.cc, xpdf/XPDFApp.cc, xpdf/XPDFApp.h, xpdf/XPDFCore.cc, xpdf/XPDFCore.h, xpdf/XPDFViewer.cc, xpdf/XPDFViewer.h: update. * goo/gfile.cc: update. * goo/Makefile.am: use GMutex.h * doc/pdffonts.1, doc/pdffonts.cat, doc/pdfimages.1, doc/pdfimages.cat, doc/pdfinfo.1, doc/pdfinfo.cat, doc/pdftopbm.1, doc/pdftopbm.cat, doc/pdftops.1, doc/pdftops.cat, doc/pdftotext.1, doc/pdftotext.cat, doc/pdftotext.hlp, doc/xpdf.1, doc/xpdf.cat, doc/xpdf.hlp, doc/xpdfrc.5, doc/xpdfrc.cat, doc/xpdfrc.hlp: update
-rw-r--r-- pdf/goo/Makefile.am 1
-rw-r--r-- pdf/goo/gfile.cc 28
-rw-r--r-- pdf/xpdf/CharCodeToUnicode.cc 2
-rw-r--r-- pdf/xpdf/Decrypt.cc 32
-rw-r--r-- pdf/xpdf/FTFont.cc 42
-rw-r--r-- pdf/xpdf/FTFont.h 3
-rw-r--r-- pdf/xpdf/Gfx.cc 9
-rw-r--r-- pdf/xpdf/GfxFont.cc 9
-rw-r--r-- pdf/xpdf/GfxState.cc 71
-rw-r--r-- pdf/xpdf/GfxState.h 11
-rw-r--r-- pdf/xpdf/GlobalParams.cc 301
-rw-r--r-- pdf/xpdf/GlobalParams.h 48
-rw-r--r-- pdf/xpdf/Link.cc 64
-rw-r--r-- pdf/xpdf/NameToUnicodeTable.h 12
-rw-r--r-- pdf/xpdf/Stream.cc 3
-rw-r--r-- pdf/xpdf/TextOutputDev.cc 2212
-rw-r--r-- pdf/xpdf/TextOutputDev.h 228
-rw-r--r-- pdf/xpdf/XOutputDev.cc 74
-rw-r--r-- pdf/xpdf/pdftotext.cc 6
-rw-r--r-- pdf/xpdf/xpdf.cc 16
-rw-r--r-- pdf/xpdf/xpdfconfig.h 8
21 files changed, 2271 insertions, 909 deletions
diff --git a/pdf/goo/Makefile.am b/pdf/goo/Makefile.am
index 24f89d8..b08ca56 100644
--- a/pdf/goo/Makefile.am
+++ b/pdf/goo/Makefile.am
@@ -5,6 +5,7 @@ libgoo_a_SOURCES = \
GHash.h \
GList.cc \
GList.h \
+ GMutex.h \
GString.cc \
GString.h \
gmempp.cc \
diff --git a/pdf/goo/gfile.cc b/pdf/goo/gfile.cc
index d6d2363..e6603c6 100644
--- a/pdf/goo/gfile.cc
+++ b/pdf/goo/gfile.cc
@@ -447,8 +447,6 @@ GBool openTempFile(GString **name, FILE **f, char *mode, char *ext) {
#if defined(WIN32)
//---------- Win32 ----------
char *s;
- char buf[_MAX_PATH];
- char *fp;
if (!(s = _tempnam(getenv("TEMP"), NULL))) {
return gFalse;
@@ -646,10 +644,8 @@ GDir::~GDir() {
}
GDirEntry *GDir::getNextEntry() {
- struct dirent *ent;
GDirEntry *e;
- e = NULL;
#if defined(WIN32)
e = new GDirEntry(path->getCString(), ffd.cFileName, doStat);
if (hnd && !FindNextFile(hnd, &ffd)) {
@@ -658,24 +654,34 @@ GDirEntry *GDir::getNextEntry() {
}
#elif defined(ACORN)
#elif defined(MACOS)
-#else
+#elif defined(VMS)
+ struct dirent *ent;
+ e = NULL;
if (dir) {
-#ifdef VMS
if (needParent) {
e = new GDirEntry(path->getCString(), "-", doStat);
needParent = gFalse;
return e;
}
-#endif
ent = readdir(dir);
-#ifndef VMS
- if (ent && !strcmp(ent->d_name, "."))
+ if (ent) {
+ e = new GDirEntry(path->getCString(), ent->d_name, doStat);
+ }
+ }
+#else
+ struct dirent *ent;
+ e = NULL;
+ if (dir) {
+ ent = readdir(dir);
+ if (ent && !strcmp(ent->d_name, ".")) {
ent = readdir(dir);
-#endif
- if (ent)
+ }
+ if (ent) {
e = new GDirEntry(path->getCString(), ent->d_name, doStat);
+ }
}
#endif
+
return e;
}
diff --git a/pdf/xpdf/CharCodeToUnicode.cc b/pdf/xpdf/CharCodeToUnicode.cc
index f61d400..e2fecbc 100644
--- a/pdf/xpdf/CharCodeToUnicode.cc
+++ b/pdf/xpdf/CharCodeToUnicode.cc
@@ -224,7 +224,7 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
map[i] = 0;
}
}
- if (n3 == 6) {
+ if (n3 <= 6) {
if (sscanf(tok3 + 1, "%x", &u) != 1) {
error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
continue;
diff --git a/pdf/xpdf/Decrypt.cc b/pdf/xpdf/Decrypt.cc
index bb3e3f1..b58a6c5 100644
--- a/pdf/xpdf/Decrypt.cc
+++ b/pdf/xpdf/Decrypt.cc
@@ -382,20 +382,20 @@ static void md5(Guchar *msg, int msgLen, Guchar *digest) {
}
// break digest into bytes
- digest[0] = a & 0xff;
- digest[1] = (a >>= 8) & 0xff;
- digest[2] = (a >>= 8) & 0xff;
- digest[3] = (a >>= 8) & 0xff;
- digest[4] = b & 0xff;
- digest[5] = (b >>= 8) & 0xff;
- digest[6] = (b >>= 8) & 0xff;
- digest[7] = (b >>= 8) & 0xff;
- digest[8] = c & 0xff;
- digest[9] = (c >>= 8) & 0xff;
- digest[10] = (c >>= 8) & 0xff;
- digest[11] = (c >>= 8) & 0xff;
- digest[12] = d & 0xff;
- digest[13] = (d >>= 8) & 0xff;
- digest[14] = (d >>= 8) & 0xff;
- digest[15] = (d >>= 8) & 0xff;
+ digest[0] = (Guchar)(a & 0xff);
+ digest[1] = (Guchar)((a >>= 8) & 0xff);
+ digest[2] = (Guchar)((a >>= 8) & 0xff);
+ digest[3] = (Guchar)((a >>= 8) & 0xff);
+ digest[4] = (Guchar)(b & 0xff);
+ digest[5] = (Guchar)((b >>= 8) & 0xff);
+ digest[6] = (Guchar)((b >>= 8) & 0xff);
+ digest[7] = (Guchar)((b >>= 8) & 0xff);
+ digest[8] = (Guchar)(c & 0xff);
+ digest[9] = (Guchar)((c >>= 8) & 0xff);
+ digest[10] = (Guchar)((c >>= 8) & 0xff);
+ digest[11] = (Guchar)((c >>= 8) & 0xff);
+ digest[12] = (Guchar)(d & 0xff);
+ digest[13] = (Guchar)((d >>= 8) & 0xff);
+ digest[14] = (Guchar)((d >>= 8) & 0xff);
+ digest[15] = (Guchar)((d >>= 8) & 0xff);
}
diff --git a/pdf/xpdf/FTFont.cc b/pdf/xpdf/FTFont.cc
index 8de09e0..ab101ac 100644
--- a/pdf/xpdf/FTFont.cc
+++ b/pdf/xpdf/FTFont.cc
@@ -56,6 +56,9 @@ FTFontFile::FTFontFile(FTFontEngine *engineA, char *fontFileName,
ok = gFalse;
engine = engineA;
codeMap = NULL;
+ cidToGID = NULL;
+ cidToGIDLen = 0;
+
if (FT_New_Face(engine->lib, fontFileName, 0, &face)) {
return;
}
@@ -144,11 +147,15 @@ FTFontFile::FTFontFile(FTFontEngine *engineA, char *fontFileName,
ok = gFalse;
engine = engineA;
codeMap = NULL;
+ cidToGID = NULL;
+ cidToGIDLen = 0;
+
if (FT_New_Face(engine->lib, fontFileName, 0, &face)) {
return;
}
- cidToGID = cidToGIDA;
cidToGIDLen = cidToGIDLenA;
+ cidToGID = (Gushort *)gmalloc(cidToGIDLen * sizeof(Gushort));
+ memcpy(cidToGID, cidToGIDA, cidToGIDLen * sizeof(Gushort));
mode = ftFontModeCIDToGIDMap;
ok = gTrue;
}
@@ -157,12 +164,17 @@ FTFontFile::FTFontFile(FTFontEngine *engineA, char *fontFileName) {
ok = gFalse;
engine = engineA;
codeMap = NULL;
+ cidToGID = NULL;
+ cidToGIDLen = 0;
+
if (FT_New_Face(engine->lib, fontFileName, 0, &face)) {
return;
}
- cidToGID = NULL;
- cidToGIDLen = 0;
- mode = ftFontModeCFFCharset;
+ if (!strcmp(face->driver->root.clazz->module_name, "t1cid")) {
+ mode = ftFontModeCID;
+ } else {
+ mode = ftFontModeCFFCharset;
+ }
ok = gTrue;
}
@@ -173,6 +185,9 @@ FTFontFile::~FTFontFile() {
if (codeMap) {
gfree(codeMap);
}
+ if (cidToGID) {
+ gfree(cidToGID);
+ }
}
//------------------------------------------------------------------------
@@ -664,20 +679,25 @@ FT_UInt FTFont::getGlyphIndex(CharCode c, Unicode u) {
break;
case ftFontModeCFFCharset:
#if 1 //~ cff cid->gid map
+ {
#if FREETYPE_MAJOR == 2 && FREETYPE_MINOR == 0
- CFF_Font *cff = (CFF_Font *)((TT_Face)fontFile->face)->extra.data;
+ CFF_Font *cff = (CFF_Font *)((TT_Face)fontFile->face)->extra.data;
#else
- CFF_Font cff = (CFF_Font)((TT_Face)fontFile->face)->extra.data;
+ CFF_Font cff = (CFF_Font)((TT_Face)fontFile->face)->extra.data;
#endif
- idx = 0;
- for (j = 0; j < (int)cff->num_glyphs; ++j) {
- if (cff->charset.sids[j] == c) {
- idx = j;
- break;
+ idx = 0;
+ for (j = 0; j < (int)cff->num_glyphs; ++j) {
+ if (cff->charset.sids[j] == c) {
+ idx = j;
+ break;
+ }
}
}
#endif
break;
+ case ftFontModeCID:
+ idx = c;
+ break;
}
return idx;
}
diff --git a/pdf/xpdf/FTFont.h b/pdf/xpdf/FTFont.h
index 02c257a..32675c6 100644
--- a/pdf/xpdf/FTFont.h
+++ b/pdf/xpdf/FTFont.h
@@ -53,7 +53,8 @@ enum FTFontIndexMode {
ftFontModeCodeMap,
ftFontModeCodeMapDirect,
ftFontModeCIDToGIDMap,
- ftFontModeCFFCharset
+ ftFontModeCFFCharset,
+ ftFontModeCID
};
class FTFontFile: public SFontFile {
diff --git a/pdf/xpdf/Gfx.cc b/pdf/xpdf/Gfx.cc
index 2717a04..21136b1 100644
--- a/pdf/xpdf/Gfx.cc
+++ b/pdf/xpdf/Gfx.cc
@@ -1825,7 +1825,7 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) {
}
void Gfx::doEndPath() {
- if (state->isPath() && clip != clipNone) {
+ if (state->isCurPt() && clip != clipNone) {
state->clip();
if (clip == clipNormal) {
out->clip(state);
@@ -2038,7 +2038,7 @@ void Gfx::doShowText(GString *s) {
double riseX, riseY;
CharCode code;
Unicode u[8];
- double x, y, dx, dy, dx2, dy2, curX, curY, tdx, tdy;
+ double x, y, dx, dy, dx2, dy2, curX, curY, tdx, tdy, lineX, lineY;
double originX, originY, tOriginX, tOriginY;
double oldCTM[6], newCTM[6];
double *mat;
@@ -2082,6 +2082,8 @@ void Gfx::doShowText(GString *s) {
state->textTransformDelta(0, state->getRise(), &riseX, &riseY);
curX = state->getCurX();
curY = state->getCurY();
+ lineX = state->getLineX();
+ lineY = state->getLineY();
oldParser = parser;
p = s->getCString();
len = s->getLength();
@@ -2120,10 +2122,11 @@ void Gfx::doShowText(GString *s) {
state = state->restore();
out->restoreState(state);
// GfxState::restore() does *not* restore the current position,
- // so we track it here with (curX, curY)
+ // so we deal with it here using (curX, curY) and (lineX, lineY)
curX += tdx;
curY += tdy;
state->moveTo(curX, curY);
+ state->textSetPos(lineX, lineY);
p += n;
len -= n;
}
diff --git a/pdf/xpdf/GfxFont.cc b/pdf/xpdf/GfxFont.cc
index 5acb845..b3b6a71 100644
--- a/pdf/xpdf/GfxFont.cc
+++ b/pdf/xpdf/GfxFont.cc
@@ -66,6 +66,9 @@ static StdFontMapEntry stdFontMap[] = {
{ "Helvetica,Italic", "Helvetica-Oblique" },
{ "Helvetica-BoldItalic", "Helvetica-BoldOblique" },
{ "Helvetica-Italic", "Helvetica-Oblique" },
+ { "Symbol,Bold", "Symbol" },
+ { "Symbol,BoldItalic", "Symbol" },
+ { "Symbol,Italic", "Symbol" },
{ "TimesNewRoman", "Times-Roman" },
{ "TimesNewRoman,Bold", "Times-Bold" },
{ "TimesNewRoman,BoldItalic", "Times-BoldItalic" },
@@ -256,6 +259,10 @@ void GfxFont::readFontDescriptor(XRef *xref, Dict *fontDict) {
if (t != 0) {
descent = t;
}
+ // some broken font descriptors specify a positive descent
+ if (descent > 0) {
+ descent = -descent;
+ }
}
obj2.free();
@@ -949,7 +956,7 @@ GfxCIDFont::GfxCIDFont(XRef *xref, char *tagA, Ref idA, GString *nameA,
// CIDToGIDMap (for embedded TrueType fonts)
if (type == fontCIDType2) {
- fontDict->lookup("CIDToGIDMap", &obj1);
+ desFontDict->lookup("CIDToGIDMap", &obj1);
if (obj1.isStream()) {
cidToGIDLen = 0;
i = 64;
diff --git a/pdf/xpdf/GfxState.cc b/pdf/xpdf/GfxState.cc
index d968ac1..a978b50 100644
--- a/pdf/xpdf/GfxState.cc
+++ b/pdf/xpdf/GfxState.cc
@@ -29,6 +29,24 @@ static inline double clip01(double x) {
}
//------------------------------------------------------------------------
+
+static char *gfxColorSpaceModeNames[] = {
+ "DeviceGray",
+ "CalGray",
+ "DeviceRGB",
+ "CalRGB",
+ "DeviceCMYK",
+ "Lab",
+ "ICCBased",
+ "Indexed",
+ "Separation",
+ "DeviceN",
+ "Pattern"
+};
+
+#define nGfxColorSpaceModes ((sizeof(gfxColorSpaceModeNames) / sizeof(char *)))
+
+//------------------------------------------------------------------------
// GfxColorSpace
//------------------------------------------------------------------------
@@ -99,6 +117,14 @@ void GfxColorSpace::getDefaultRanges(double *decodeLow, double *decodeRange,
}
}
+int GfxColorSpace::getNumColorSpaceModes() {
+ return nGfxColorSpaceModes;
+}
+
+char *GfxColorSpace::getColorSpaceModeName(int idx) {
+ return gfxColorSpaceModeNames[idx];
+}
+
//------------------------------------------------------------------------
// GfxDeviceGrayColorSpace
//------------------------------------------------------------------------
@@ -850,9 +876,9 @@ GfxColorSpace *GfxIndexedColorSpace::parse(Array *arr) {
return NULL;
}
-void GfxIndexedColorSpace::getGray(GfxColor *color, double *gray) {
+GfxColor *GfxIndexedColorSpace::mapColorToBase(GfxColor *color,
+ GfxColor *baseColor) {
Guchar *p;
- GfxColor color2;
double low[gfxColorMaxComps], range[gfxColorMaxComps];
int n, i;
@@ -860,39 +886,27 @@ void GfxIndexedColorSpace::getGray(GfxColor *color, double *gray) {
base->getDefaultRanges(low, range, indexHigh);
p = &lookup[(int)(color->c[0] + 0.5) * n];
for (i = 0; i < n; ++i) {
- color2.c[i] = low[i] + (p[i] / 255.0) * range[i];
+ baseColor->c[i] = low[i] + (p[i] / 255.0) * range[i];
}
- base->getGray(&color2, gray);
+ return baseColor;
+}
+
+void GfxIndexedColorSpace::getGray(GfxColor *color, double *gray) {
+ GfxColor color2;
+
+ base->getGray(mapColorToBase(color, &color2), gray);
}
void GfxIndexedColorSpace::getRGB(GfxColor *color, GfxRGB *rgb) {
- Guchar *p;
GfxColor color2;
- double low[gfxColorMaxComps], range[gfxColorMaxComps];
- int n, i;
- n = base->getNComps();
- base->getDefaultRanges(low, range, indexHigh);
- p = &lookup[(int)(color->c[0] + 0.5) * n];
- for (i = 0; i < n; ++i) {
- color2.c[i] = low[i] + (p[i] / 255.0) * range[i];
- }
- base->getRGB(&color2, rgb);
+ base->getRGB(mapColorToBase(color, &color2), rgb);
}
void GfxIndexedColorSpace::getCMYK(GfxColor *color, GfxCMYK *cmyk) {
- Guchar *p;
GfxColor color2;
- double low[gfxColorMaxComps], range[gfxColorMaxComps];
- int n, i;
- n = base->getNComps();
- base->getDefaultRanges(low, range, indexHigh);
- p = &lookup[(int)(color->c[0] + 0.5) * n];
- for (i = 0; i < n; ++i) {
- color2.c[i] = low[i] + (p[i] / 255.0) * range[i];
- }
- base->getCMYK(&color2, cmyk);
+ base->getCMYK(mapColorToBase(color, &color2), cmyk);
}
void GfxIndexedColorSpace::getDefaultRanges(double *decodeLow,
@@ -1789,6 +1803,15 @@ void GfxImageColorMap::getCMYK(Guchar *x, GfxCMYK *cmyk) {
}
}
+void GfxImageColorMap::getColor(Guchar *x, GfxColor *color) {
+ int maxPixel, i;
+
+ maxPixel = (1 << bits) - 1;
+ for (i = 0; i < nComps; ++i) {
+ color->c[i] = decodeLow[i] + (x[i] * decodeRange[i]) / maxPixel;
+ }
+}
+
//------------------------------------------------------------------------
// GfxSubpath and GfxPath
//------------------------------------------------------------------------
diff --git a/pdf/xpdf/GfxState.h b/pdf/xpdf/GfxState.h
index e99735c..cfe8f9b 100644
--- a/pdf/xpdf/GfxState.h
+++ b/pdf/xpdf/GfxState.h
@@ -53,6 +53,8 @@ struct GfxCMYK {
// GfxColorSpace
//------------------------------------------------------------------------
+// NB: The nGfxColorSpaceModes constant and the gfxColorSpaceModeNames
+// array defined in GfxState.cc must match this enum.
enum GfxColorSpaceMode {
csDeviceGray,
csCalGray,
@@ -91,6 +93,12 @@ public:
virtual void getDefaultRanges(double *decodeLow, double *decodeRange,
int maxImgPixel);
+ // Return the number of color space modes
+ static int getNumColorSpaceModes();
+
+ // Return the name of the <idx>th color space mode.
+ static char *getColorSpaceModeName(int idx);
+
private:
};
@@ -344,6 +352,7 @@ public:
GfxColorSpace *getBase() { return base; }
int getIndexHigh() { return indexHigh; }
Guchar *getLookup() { return lookup; }
+ GfxColor *mapColorToBase(GfxColor *color, GfxColor *baseColor);
private:
@@ -636,6 +645,7 @@ public:
void getGray(Guchar *x, double *gray);
void getRGB(Guchar *x, GfxRGB *rgb);
void getCMYK(Guchar *x, GfxCMYK *cmyk);
+ void getColor(Guchar *x, GfxColor *color);
private:
@@ -902,6 +912,7 @@ public:
void clip();
// Text position.
+ void textSetPos(double tx, double ty) { lineX = tx; lineY = ty; }
void textMoveTo(double tx, double ty)
{ lineX = tx; lineY = ty; textTransform(tx, ty, &curX, &curY); }
void textShift(double tx, double ty);
diff --git a/pdf/xpdf/GlobalParams.cc b/pdf/xpdf/GlobalParams.cc
index b50c15b..ded583f 100644
--- a/pdf/xpdf/GlobalParams.cc
+++ b/pdf/xpdf/GlobalParams.cc
@@ -31,6 +31,14 @@
#include "FontEncodingTables.h"
#include "GlobalParams.h"
+#if MULTITHREADED
+# define globalParamsLock gLockMutex(&mutex)
+# define globalParamsUnlock gUnlockMutex(&mutex)
+#else
+# define globalParamsLock
+# define globalParamsUnlock
+#endif
+
#include "NameToUnicodeTable.h"
#include "UnicodeMapTables.h"
#include "DisplayFontTable.h"
@@ -124,6 +132,10 @@ GlobalParams::GlobalParams(char *cfgFileName) {
FILE *f;
int i;
+#if MULTITHREADED
+ gInitMutex(&mutex);
+#endif
+
initBuiltinFontTables();
// scan the encoding in reverse because we want the lowest-numbered
@@ -276,7 +288,7 @@ void GlobalParams::parseFile(GString *fileName, FILE *f) {
FILE *f2;
line = 1;
- while (fgets(buf, sizeof(buf) - 1, f)) {
+ while (getLine(buf, sizeof(buf) - 1, f)) {
// break the line into tokens
tokens = new GList();
@@ -293,7 +305,7 @@ void GlobalParams::parseFile(GString *fileName, FILE *f) {
for (p2 = p1 + 1; *p2 && !isspace(*p2); ++p2) ;
}
tokens->append(new GString(p1, p2 - p1));
- p1 = p2 + 1;
+ p1 = *p2 ? p2 + 1 : p2;
}
if (tokens->getLength() > 0 &&
@@ -329,12 +341,18 @@ void GlobalParams::parseFile(GString *fileName, FILE *f) {
parseDisplayFont(tokens, displayFonts, displayFontT1, fileName, line);
} else if (!cmd->cmp("displayFontTT")) {
parseDisplayFont(tokens, displayFonts, displayFontTT, fileName, line);
+ } else if (!cmd->cmp("displayNamedCIDFontX")) {
+ parseDisplayFont(tokens, displayNamedCIDFonts,
+ displayFontX, fileName, line);
} else if (!cmd->cmp("displayCIDFontX")) {
parseDisplayFont(tokens, displayCIDFonts,
displayFontX, fileName, line);
- } else if (!cmd->cmp("displayNamedCIDFontX")) {
+ } else if (!cmd->cmp("displayNamedCIDFontT1")) {
parseDisplayFont(tokens, displayNamedCIDFonts,
- displayFontX, fileName, line);
+ displayFontT1, fileName, line);
+ } else if (!cmd->cmp("displayCIDFontT1")) {
+ parseDisplayFont(tokens, displayCIDFonts,
+ displayFontT1, fileName, line);
} else if (!cmd->cmp("psFile")) {
parsePSFile(tokens, fileName, line);
} else if (!cmd->cmp("psFont")) {
@@ -428,7 +446,7 @@ void GlobalParams::parseNameToUnicode(GList *tokens, GString *fileName,
return;
}
line2 = 1;
- while (fgets(buf, sizeof(buf), f)) {
+ while (getLine(buf, sizeof(buf), f)) {
tok1 = strtok(buf, " \t\r\n");
tok2 = strtok(NULL, " \t\r\n");
if (tok1 && tok2) {
@@ -794,6 +812,10 @@ GlobalParams::~GlobalParams() {
delete cidToUnicodeCache;
delete unicodeMapCache;
delete cMapCache;
+
+#if MULTITHREADED
+ gDestroyMutex(&mutex);
+#endif
}
//------------------------------------------------------------------------
@@ -870,7 +892,12 @@ FILE *GlobalParams::findToUnicodeFile(GString *name) {
}
DisplayFontParam *GlobalParams::getDisplayFont(GString *fontName) {
- return (DisplayFontParam *)displayFonts->lookup(fontName);
+ DisplayFontParam *dfp;
+
+ globalParamsLock;
+ dfp = (DisplayFontParam *)displayFonts->lookup(fontName);
+ globalParamsUnlock;
+ return dfp;
}
DisplayFontParam *GlobalParams::getDisplayCIDFont(GString *fontName,
@@ -884,6 +911,51 @@ DisplayFontParam *GlobalParams::getDisplayCIDFont(GString *fontName,
return dfp;
}
+GString *GlobalParams::getPSFile() {
+ GString *s;
+
+ globalParamsLock;
+ s = psFile ? psFile->copy() : (GString *)NULL;
+ globalParamsUnlock;
+ return s;
+}
+
+int GlobalParams::getPSPaperWidth() {
+ int w;
+
+ globalParamsLock;
+ w = psPaperWidth;
+ globalParamsUnlock;
+ return w;
+}
+
+int GlobalParams::getPSPaperHeight() {
+ int h;
+
+ globalParamsLock;
+ h = psPaperHeight;
+ globalParamsUnlock;
+ return h;
+}
+
+GBool GlobalParams::getPSDuplex() {
+ GBool d;
+
+ globalParamsLock;
+ d = psDuplex;
+ globalParamsUnlock;
+ return d;
+}
+
+PSLevel GlobalParams::getPSLevel() {
+ PSLevel level;
+
+ globalParamsLock;
+ level = psLevel;
+ globalParamsUnlock;
+ return level;
+}
+
PSFontParam *GlobalParams::getPSFont(GString *fontName) {
return (PSFontParam *)psFonts->lookup(fontName);
}
@@ -917,6 +989,78 @@ PSFontParam *GlobalParams::getPSFont16(GString *fontName,
return p;
}
+GBool GlobalParams::getPSEmbedType1() {
+ GBool e;
+
+ globalParamsLock;
+ e = psEmbedType1;
+ globalParamsUnlock;
+ return e;
+}
+
+GBool GlobalParams::getPSEmbedTrueType() {
+ GBool e;
+
+ globalParamsLock;
+ e = psEmbedTrueType;
+ globalParamsUnlock;
+ return e;
+}
+
+GBool GlobalParams::getPSEmbedCIDPostScript() {
+ GBool e;
+
+ globalParamsLock;
+ e = psEmbedCIDPostScript;
+ globalParamsUnlock;
+ return e;
+}
+
+GBool GlobalParams::getPSEmbedCIDTrueType() {
+ GBool e;
+
+ globalParamsLock;
+ e = psEmbedCIDTrueType;
+ globalParamsUnlock;
+ return e;
+}
+
+GBool GlobalParams::getPSOPI() {
+ GBool opi;
+
+ globalParamsLock;
+ opi = psOPI;
+ globalParamsUnlock;
+ return opi;
+}
+
+GBool GlobalParams::getPSASCIIHex() {
+ GBool ah;
+
+ globalParamsLock;
+ ah = psASCIIHex;
+ globalParamsUnlock;
+ return ah;
+}
+
+EndOfLineKind GlobalParams::getTextEOL() {
+ EndOfLineKind eol;
+
+ globalParamsLock;
+ eol = textEOL;
+ globalParamsUnlock;
+ return eol;
+}
+
+GBool GlobalParams::getTextKeepTinyChars() {
+ GBool tiny;
+
+ globalParamsLock;
+ tiny = textKeepTinyChars;
+ globalParamsUnlock;
+ return tiny;
+}
+
GString *GlobalParams::findFontFile(GString *fontName,
char *ext1, char *ext2) {
GString *dir, *fileName;
@@ -947,26 +1091,105 @@ GString *GlobalParams::findFontFile(GString *fontName,
return NULL;
}
+GString *GlobalParams::getInitialZoom() {
+ GString *s;
+
+ globalParamsLock;
+ s = initialZoom->copy();
+ globalParamsUnlock;
+ return s;
+}
+
+FontRastControl GlobalParams::getT1libControl() {
+ FontRastControl c;
+
+ globalParamsLock;
+ c = t1libControl;
+ globalParamsUnlock;
+ return c;
+}
+
+FontRastControl GlobalParams::getFreeTypeControl() {
+ FontRastControl c;
+
+ globalParamsLock;
+ c = freetypeControl;
+ globalParamsUnlock;
+ return c;
+}
+
+GBool GlobalParams::getMapNumericCharNames() {
+ GBool map;
+
+ globalParamsLock;
+ map = mapNumericCharNames;
+ globalParamsUnlock;
+ return map;
+}
+
+GBool GlobalParams::getPrintCommands() {
+ GBool p;
+
+ globalParamsLock;
+ p = printCommands;
+ globalParamsUnlock;
+ return p;
+}
+
+GBool GlobalParams::getErrQuiet() {
+ GBool q;
+
+ globalParamsLock;
+ q = errQuiet;
+ globalParamsUnlock;
+ return q;
+}
+
CharCodeToUnicode *GlobalParams::getCIDToUnicode(GString *collection) {
- return cidToUnicodeCache->getCIDToUnicode(collection);
+ CharCodeToUnicode *ctu;
+
+ globalParamsLock;
+ ctu = cidToUnicodeCache->getCIDToUnicode(collection);
+ globalParamsUnlock;
+ return ctu;
}
UnicodeMap *GlobalParams::getUnicodeMap(GString *encodingName) {
UnicodeMap *map;
+ globalParamsLock;
+ map = getUnicodeMap2(encodingName);
+ globalParamsUnlock;
+ return map;
+}
+
+UnicodeMap *GlobalParams::getUnicodeMap2(GString *encodingName) {
+ UnicodeMap *map;
+
if ((map = getResidentUnicodeMap(encodingName))) {
map->incRefCnt();
- return map;
+ } else {
+ map = unicodeMapCache->getUnicodeMap(encodingName);
}
- return unicodeMapCache->getUnicodeMap(encodingName);
+ return map;
}
CMap *GlobalParams::getCMap(GString *collection, GString *cMapName) {
- return cMapCache->getCMap(collection, cMapName);
+ CMap *cMap;
+
+ globalParamsLock;
+ cMap = cMapCache->getCMap(collection, cMapName);
+ globalParamsUnlock;
+ return cMap;
}
UnicodeMap *GlobalParams::getTextEncoding() {
- return getUnicodeMap(textEncoding);
+ UnicodeMap *map;
+
+ globalParamsLock;
+ map = getUnicodeMap2(textEncoding);
+ globalParamsUnlock;
+ return map;
}
//------------------------------------------------------------------------
@@ -976,20 +1199,25 @@ UnicodeMap *GlobalParams::getTextEncoding() {
void GlobalParams::addDisplayFont(DisplayFontParam *param) {
DisplayFontParam *old;
+ globalParamsLock;
if ((old = (DisplayFontParam *)displayFonts->remove(param->name))) {
delete old;
}
displayFonts->add(param->name, param);
+ globalParamsUnlock;
}
void GlobalParams::setPSFile(char *file) {
+ globalParamsLock;
if (psFile) {
delete psFile;
}
psFile = new GString(file);
+ globalParamsUnlock;
}
GBool GlobalParams::setPSPaperSize(char *size) {
+ globalParamsLock;
if (!strcmp(size, "letter")) {
psPaperWidth = 612;
psPaperHeight = 792;
@@ -1003,57 +1231,82 @@ GBool GlobalParams::setPSPaperSize(char *size) {
psPaperWidth = 842;
psPaperHeight = 1190;
} else {
+ globalParamsUnlock;
return gFalse;
}
+ globalParamsUnlock;
return gTrue;
}
void GlobalParams::setPSPaperWidth(int width) {
+ globalParamsLock;
psPaperWidth = width;
+ globalParamsUnlock;
}
void GlobalParams::setPSPaperHeight(int height) {
+ globalParamsLock;
psPaperHeight = height;
+ globalParamsUnlock;
}
void GlobalParams::setPSDuplex(GBool duplex) {
+ globalParamsLock;
psDuplex = duplex;
+ globalParamsUnlock;
}
void GlobalParams::setPSLevel(PSLevel level) {
+ globalParamsLock;
psLevel = level;
+ globalParamsUnlock;
}
void GlobalParams::setPSEmbedType1(GBool embed) {
+ globalParamsLock;
psEmbedType1 = embed;
+ globalParamsUnlock;
}
void GlobalParams::setPSEmbedTrueType(GBool embed) {
+ globalParamsLock;
psEmbedTrueType = embed;
+ globalParamsUnlock;
}
void GlobalParams::setPSEmbedCIDPostScript(GBool embed) {
+ globalParamsLock;
psEmbedCIDPostScript = embed;
+ globalParamsUnlock;
}
void GlobalParams::setPSEmbedCIDTrueType(GBool embed) {
+ globalParamsLock;
psEmbedCIDTrueType = embed;
+ globalParamsUnlock;
}
void GlobalParams::setPSOPI(GBool opi) {
+ globalParamsLock;
psOPI = opi;
+ globalParamsUnlock;
}
void GlobalParams::setPSASCIIHex(GBool hex) {
+ globalParamsLock;
psASCIIHex = hex;
+ globalParamsUnlock;
}
void GlobalParams::setTextEncoding(char *encodingName) {
+ globalParamsLock;
delete textEncoding;
textEncoding = new GString(encodingName);
+ globalParamsUnlock;
}
GBool GlobalParams::setTextEOL(char *s) {
+ globalParamsLock;
if (!strcmp(s, "unix")) {
textEOL = eolUnix;
} else if (!strcmp(s, "dos")) {
@@ -1061,26 +1314,42 @@ GBool GlobalParams::setTextEOL(char *s) {
} else if (!strcmp(s, "mac")) {
textEOL = eolMac;
} else {
+ globalParamsUnlock;
return gFalse;
}
+ globalParamsUnlock;
return gTrue;
}
void GlobalParams::setTextKeepTinyChars(GBool keep) {
+ globalParamsLock;
textKeepTinyChars = keep;
+ globalParamsUnlock;
}
void GlobalParams::setInitialZoom(char *s) {
+ globalParamsLock;
delete initialZoom;
initialZoom = new GString(s);
+ globalParamsUnlock;
}
GBool GlobalParams::setT1libControl(char *s) {
- return setFontRastControl(&t1libControl, s);
+ GBool ok;
+
+ globalParamsLock;
+ ok = setFontRastControl(&t1libControl, s);
+ globalParamsUnlock;
+ return ok;
}
GBool GlobalParams::setFreeTypeControl(char *s) {
- return setFontRastControl(&freetypeControl, s);
+ GBool ok;
+
+ globalParamsLock;
+ ok = setFontRastControl(&freetypeControl, s);
+ globalParamsUnlock;
+ return ok;
}
GBool GlobalParams::setFontRastControl(FontRastControl *val, char *s) {
@@ -1099,13 +1368,19 @@ GBool GlobalParams::setFontRastControl(FontRastControl *val, char *s) {
}
void GlobalParams::setMapNumericCharNames(GBool map) {
+ globalParamsLock;
mapNumericCharNames = map;
+ globalParamsUnlock;
}
void GlobalParams::setPrintCommands(GBool printCommandsA) {
+ globalParamsLock;
printCommands = printCommandsA;
+ globalParamsUnlock;
}
void GlobalParams::setErrQuiet(GBool errQuietA) {
+ globalParamsLock;
errQuiet = errQuietA;
+ globalParamsUnlock;
}
diff --git a/pdf/xpdf/GlobalParams.h b/pdf/xpdf/GlobalParams.h
index 0f783e8..5fb3be3 100644
--- a/pdf/xpdf/GlobalParams.h
+++ b/pdf/xpdf/GlobalParams.h
@@ -19,6 +19,10 @@
#include "gtypes.h"
#include "CharTypes.h"
+#if MULTITHREADED
+#include "GMutex.h"
+#endif
+
class GString;
class GList;
class GHash;
@@ -137,31 +141,30 @@ public:
FILE *findToUnicodeFile(GString *name);
DisplayFontParam *getDisplayFont(GString *fontName);
DisplayFontParam *getDisplayCIDFont(GString *fontName, GString *collection);
- GString *getPSFile() { return psFile; }
- int getPSPaperWidth() { return psPaperWidth; }
- int getPSPaperHeight() { return psPaperHeight; }
- GBool getPSDuplex() { return psDuplex; }
- PSLevel getPSLevel() { return psLevel; }
+ GString *getPSFile();
+ int getPSPaperWidth();
+ int getPSPaperHeight();
+ GBool getPSDuplex();
+ PSLevel getPSLevel();
PSFontParam *getPSFont(GString *fontName);
PSFontParam *getPSFont16(GString *fontName, GString *collection, int wMode);
- GBool getPSEmbedType1() { return psEmbedType1; }
- GBool getPSEmbedTrueType() { return psEmbedTrueType; }
- GBool getPSEmbedCIDPostScript() { return psEmbedCIDPostScript; }
- GBool getPSEmbedCIDTrueType() { return psEmbedCIDTrueType; }
- GBool getPSOPI() { return psOPI; }
- GBool getPSASCIIHex() { return psASCIIHex; }
- GString *getTextEncodingName() { return textEncoding; }
- EndOfLineKind getTextEOL() { return textEOL; }
- GBool getTextKeepTinyChars() { return textKeepTinyChars; }
+ GBool getPSEmbedType1();
+ GBool getPSEmbedTrueType();
+ GBool getPSEmbedCIDPostScript();
+ GBool getPSEmbedCIDTrueType();
+ GBool getPSOPI();
+ GBool getPSASCIIHex();
+ EndOfLineKind getTextEOL();
+ GBool getTextKeepTinyChars();
GString *findFontFile(GString *fontName, char *ext1, char *ext2);
- GString *getInitialZoom() { return initialZoom; }
- FontRastControl getT1libControl() { return t1libControl; }
- FontRastControl getFreeTypeControl() { return freetypeControl; }
+ GString *getInitialZoom();
+ FontRastControl getT1libControl();
+ FontRastControl getFreeTypeControl();
GString *getURLCommand() { return urlCommand; }
GString *getMovieCommand() { return movieCommand; }
- GBool getMapNumericCharNames() { return mapNumericCharNames; }
- GBool getPrintCommands() { return printCommands; }
- GBool getErrQuiet() { return errQuiet; }
+ GBool getMapNumericCharNames();
+ GBool getPrintCommands();
+ GBool getErrQuiet();
CharCodeToUnicode *getCIDToUnicode(GString *collection);
UnicodeMap *getUnicodeMap(GString *encodingName);
@@ -220,6 +223,7 @@ private:
GList *tokens, GString *fileName, int line);
void parseYesNo(char *cmdName, GBool *flag,
GList *tokens, GString *fileName, int line);
+ UnicodeMap *getUnicodeMap2(GString *encodingName);
GBool setFontRastControl(FontRastControl *val, char *s);
//----- static tables
@@ -281,6 +285,10 @@ private:
CIDToUnicodeCache *cidToUnicodeCache;
UnicodeMapCache *unicodeMapCache;
CMapCache *cMapCache;
+
+#if MULTITHREADED
+ GMutex mutex; // taken via globalParamsLock in GlobalParams.cc; guard must match the "#if MULTITHREADED" around the GMutex.h include
+#endif
};
#endif
diff --git a/pdf/xpdf/Link.cc b/pdf/xpdf/Link.cc
index b16563a..0c3a869 100644
--- a/pdf/xpdf/Link.cc
+++ b/pdf/xpdf/Link.cc
@@ -170,44 +170,52 @@ LinkDest::LinkDest(Array *a) {
// XYZ link
if (obj1.isName("XYZ")) {
- if (a->getLength() != 5) {
- error(-1, "Annotation destination array has wrong length");
- goto err2;
- }
kind = destXYZ;
- a->get(2, &obj2);
- if (obj2.isNull()) {
+ if (a->getLength() < 3) {
changeLeft = gFalse;
- } else if (obj2.isNum()) {
- changeLeft = gTrue;
- left = obj2.getNum();
} else {
- error(-1, "Bad annotation destination position");
- goto err1;
+ a->get(2, &obj2);
+ if (obj2.isNull()) {
+ changeLeft = gFalse;
+ } else if (obj2.isNum()) {
+ changeLeft = gTrue;
+ left = obj2.getNum();
+ } else {
+ error(-1, "Bad annotation destination position");
+ goto err1;
+ }
+ obj2.free();
}
- obj2.free();
- a->get(3, &obj2);
- if (obj2.isNull()) {
+ if (a->getLength() < 4) {
changeTop = gFalse;
- } else if (obj2.isNum()) {
- changeTop = gTrue;
- top = obj2.getNum();
} else {
- error(-1, "Bad annotation destination position");
- goto err1;
+ a->get(3, &obj2);
+ if (obj2.isNull()) {
+ changeTop = gFalse;
+ } else if (obj2.isNum()) {
+ changeTop = gTrue;
+ top = obj2.getNum();
+ } else {
+ error(-1, "Bad annotation destination position");
+ goto err1;
+ }
+ obj2.free();
}
- obj2.free();
- a->get(4, &obj2);
- if (obj2.isNull()) {
+ if (a->getLength() < 5) {
changeZoom = gFalse;
- } else if (obj2.isNum()) {
- changeZoom = gTrue;
- zoom = obj2.getNum();
} else {
- error(-1, "Bad annotation destination position");
- goto err1;
+ a->get(4, &obj2);
+ if (obj2.isNull()) {
+ changeZoom = gFalse;
+ } else if (obj2.isNum()) {
+ changeZoom = gTrue;
+ zoom = obj2.getNum();
+ } else {
+ error(-1, "Bad annotation destination position");
+ goto err1;
+ }
+ obj2.free();
}
- obj2.free();
// Fit link
} else if (obj1.isName("Fit")) {
diff --git a/pdf/xpdf/NameToUnicodeTable.h b/pdf/xpdf/NameToUnicodeTable.h
index 320c558..99bcf1d 100644
--- a/pdf/xpdf/NameToUnicodeTable.h
+++ b/pdf/xpdf/NameToUnicodeTable.h
@@ -684,8 +684,8 @@ static struct {
{0xf6e2, "commasuperior"},
{0x2245, "congruent"},
{0x00a9, "copyright"},
- {0xf8e9, "copyrightsans"},
- {0xf6d9, "copyrightserif"},
+ {0x00a9, "copyrightsans"},
+ {0x00a9, "copyrightserif"},
{0x00a4, "currency"},
{0xf6d1, "cyrBreve"},
{0xf6d2, "cyrFlex"},
@@ -972,8 +972,8 @@ static struct {
{0x2286, "reflexsubset"},
{0x2287, "reflexsuperset"},
{0x00ae, "registered"},
- {0xf8e8, "registersans"},
- {0xf6da, "registerserif"},
+ {0x00ae, "registersans"},
+ {0x00ae, "registerserif"},
{0x2310, "revlogicalnot"},
{0x03c1, "rho"},
{0x02da, "ring"},
@@ -1031,8 +1031,8 @@ static struct {
{0x0303, "tildecomb"},
{0x0384, "tonos"},
{0x2122, "trademark"},
- {0xf8ea, "trademarksans"},
- {0xf6db, "trademarkserif"},
+ {0x2122, "trademarksans"},
+ {0x2122, "trademarkserif"},
{0x25bc, "triagdn"},
{0x25c4, "triaglf"},
{0x25ba, "triagrt"},
diff --git a/pdf/xpdf/Stream.cc b/pdf/xpdf/Stream.cc
index 0d19d4d..b2abef8 100644
--- a/pdf/xpdf/Stream.cc
+++ b/pdf/xpdf/Stream.cc
@@ -467,7 +467,7 @@ GBool StreamPredictor::getNextLine() {
upLeftBuf[1] = upLeftBuf[0];
upLeftBuf[0] = predLine[i];
if ((c = str->getRawChar()) == EOF) {
- break;
+ return gFalse;
}
switch (curPred) {
case 11: // PNG sub
@@ -506,7 +506,6 @@ GBool StreamPredictor::getNextLine() {
}
// apply TIFF (component) predictor
- //~ this is completely untested
if (predictor == 2) {
if (nBits == 1) {
inBuf = predLine[pixBytes - 1];
diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc
index 891752c..b782b42 100644
--- a/pdf/xpdf/TextOutputDev.cc
+++ b/pdf/xpdf/TextOutputDev.cc
@@ -17,8 +17,9 @@
#include <stddef.h>
#include <math.h>
#include <ctype.h>
-#include "GString.h"
#include "gmem.h"
+#include "GString.h"
+#include "GList.h"
#include "config.h"
#include "Error.h"
#include "GlobalParams.h"
@@ -32,103 +33,153 @@
#endif
//------------------------------------------------------------------------
-
-#define textOutSpace 0.2
-#define textOutColSpace 0.2
-
+// parameters
//------------------------------------------------------------------------
-struct TextOutColumnEdge {
- double x, y0, y1;
-};
+// Minimum and maximum inter-word spacing (as a fraction of the average
+// character width).
+#define wordMinSpaceWidth 0.3
+#define wordMaxSpaceWidth 2.0
+
+// Default min and max inter-word spacing (when the average character
+// width is unknown).
+#define wordDefMinSpaceWidth 0.2
+#define wordDefMaxSpaceWidth 1.5
+
+// Max difference in x,y coordinates (as a fraction of the font size)
+// allowed for duplicated text (fake boldface, drop shadows) which is
+// to be discarded.
+#define dupMaxDeltaX 0.2
+#define dupMaxDeltaY 0.2
+
+// Min overlap (as a fraction of the font size) required for two
+// lines to be considered vertically overlapping.
+#define lineOverlapSlack 0.5
+
+// Max difference in baseline y coordinates (as a fraction of the font
+// size) allowed for words which are to be grouped into a line, not
+// including sub/superscripts.
+#define lineMaxBaselineDelta 0.1
+
+// Max ratio of font sizes allowed for words which are to be grouped
+// into a line, not including sub/superscripts.
+#define lineMaxFontSizeRatio 1.4
+
+// Min spacing (as a fraction of the font size) allowed between words
+// which are to be grouped into a line.
+#define lineMinDeltaX -0.5
+
+// Minimum vertical overlap (as a fraction of the font size) required
+// for superscript and subscript words.
+#define lineMinSuperscriptOverlap 0.3
+#define lineMinSubscriptOverlap 0.3
+
+// Min/max ratio of font sizes allowed for sub/superscripts compared to
+// the base text.
+#define lineMinSubscriptFontSizeRatio 0.4
+#define lineMaxSubscriptFontSizeRatio 1.01
+#define lineMinSuperscriptFontSizeRatio 0.4
+#define lineMaxSuperscriptFontSizeRatio 1.01
+
+// Max horizontal spacing (as a fraction of the font size) allowed
+// before sub/superscripts.
+#define lineMaxSubscriptDeltaX 0.2
+#define lineMaxSuperscriptDeltaX 0.2
+
+// Maximum vertical spacing (as a fraction of the font size) allowed
+// for lines which are to be grouped into a block.
+#define blkMaxSpacing 2.0
+
+// Max ratio of primary font sizes allowed for lines which are to be
+// grouped into a block.
+#define blkMaxFontSizeRatio 1.3
+
+// Min overlap (as a fraction of the font size) required for two
+// blocks to be considered vertically overlapping.
+#define blkOverlapSlack 0.5
+
+// Max vertical spacing (as a fraction of the font size) allowed
+// between blocks which are 'adjacent' when sorted by reading order.
+#define blkMaxSortSpacing 2.0
+
+// Max vertical offset (as a fraction of the font size) of the top and
+// bottom edges allowed for blocks which are to be grouped into a
+// flow.
+#define flowMaxDeltaY 1.0
//------------------------------------------------------------------------
-// TextBlock
+// TextFontInfo
//------------------------------------------------------------------------
-class TextBlock {
-public:
-
- TextBlock();
- ~TextBlock();
-
- double xMin, xMax;
- double yMin, yMax;
- TextString *strings; // list of strings in the block
- TextBlock *next; // next block in line
- TextBlock *xyNext; // next block on xyBlocks list
- Unicode *text; // Unicode text of the block, including
- // spaces between strings
- double *xRight; // right-hand x coord of each char
- int len; // total number of Unicode characters
- int convertedLen; // total number of converted characters
- int *col; // starting column number for each
- // Unicode character
-};
-
-TextBlock::TextBlock() {
- strings = NULL;
- next = NULL;
- xyNext = NULL;
- text = NULL;
- xRight = NULL;
- col = NULL;
-}
-
-TextBlock::~TextBlock() {
- TextString *p1, *p2;
+TextFontInfo::TextFontInfo(GfxState *state) {
+ double *textMat;
+ double t1, t2, avgWidth, w;
+ int n, i;
- for (p1 = strings; p1; p1 = p2) {
- p2 = p1->next;
- delete p1;
+ gfxFont = state->getFont();
+ textMat = state->getTextMat();
+ horizScaling = state->getHorizScaling();
+ if ((t1 = fabs(textMat[0])) > 0.01 &&
+ (t2 = fabs(textMat[3])) > 0.01) {
+ horizScaling *= t1 / t2;
}
- gfree(text);
- gfree(xRight);
- gfree(col);
-}
-
-//------------------------------------------------------------------------
-// TextLine
-//------------------------------------------------------------------------
-
-class TextLine {
-public:
- TextLine();
- ~TextLine();
+ if (!gfxFont) {
+ minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
+ maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
+ } else if (gfxFont->isCIDFont()) {
+ //~ handle 16-bit fonts
+ minSpaceWidth = horizScaling * wordDefMinSpaceWidth;
+ maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth;
+ } else {
+ avgWidth = 0;
+ n = 0;
+ for (i = 0; i < 256; ++i) {
+ w = ((Gfx8BitFont *)gfxFont)->getWidth(i);
+ if (w > 0) {
+ avgWidth += w;
+ ++n;
+ }
+ }
+ avgWidth /= n;
+ minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth;
+ maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth;
+ }
- TextBlock *blocks;
- TextLine *next;
- double yMin, yMax;
-};
+}
-TextLine::TextLine() {
- blocks = NULL;
- next = NULL;
+TextFontInfo::~TextFontInfo() {
}
-TextLine::~TextLine() {
- TextBlock *p1, *p2;
+GBool TextFontInfo::matches(GfxState *state) {
+ double *textMat;
+ double t1, t2, h;
- for (p1 = blocks; p1; p1 = p2) {
- p2 = p1->next;
- delete p1;
+ textMat = state->getTextMat();
+ h = state->getHorizScaling();
+ if ((t1 = fabs(textMat[0])) > 0.01 &&
+ (t2 = fabs(textMat[3])) > 0.01) {
+ h *= t1 / t2;
}
+ return state->getFont() == gfxFont &&
+ fabs(h - horizScaling) < 0.01;
}
//------------------------------------------------------------------------
-// TextString
+// TextWord
//------------------------------------------------------------------------
-TextString::TextString(GfxState *state, double x0, double y0,
- double fontSize) {
- GfxFont *font;
+TextWord::TextWord(GfxState *state, double x0, double y0,
+ TextFontInfo *fontA, double fontSizeA) {
+ GfxFont *gfxFont;
double x, y;
+ font = fontA;
+ fontSize = fontSizeA;
state->transform(x0, y0, &x, &y);
- if ((font = state->getFont())) {
- yMin = y - font->getAscent() * fontSize;
- yMax = y - font->getDescent() * fontSize;
+ if ((gfxFont = font->gfxFont)) {
+ yMin = y - gfxFont->getAscent() * fontSize;
+ yMax = y - gfxFont->getDescent() * fontSize;
} else {
// this means that the PDF file draws text without a current font,
// which should never happen
@@ -141,21 +192,23 @@ TextString::TextString(GfxState *state, double x0, double y0,
yMin = y;
yMax = y + 1;
}
- marked = gFalse;
+ yBase = y;
text = NULL;
xRight = NULL;
len = size = 0;
+ spaceAfter = gFalse;
next = NULL;
+
}
-TextString::~TextString() {
+TextWord::~TextWord() {
gfree(text);
gfree(xRight);
}
-void TextString::addChar(GfxState *state, double x, double y,
- double dx, double dy, Unicode u) {
+void TextWord::addChar(GfxState *state, double x, double y,
+ double dx, double dy, Unicode u) {
if (len == size) {
size += 16;
text = (Unicode *)grealloc(text, size * sizeof(Unicode));
@@ -169,35 +222,249 @@ void TextString::addChar(GfxState *state, double x, double y,
++len;
}
+// Returns true if <this> comes before <word2> in xy order.
+GBool TextWord::xyBefore(TextWord *word2) {
+ return xMin < word2->xMin ||
+ (xMin == word2->xMin && yMin < word2->yMin);
+}
+
+// Merge another word onto the end of this one.
+void TextWord::merge(TextWord *word2) {
+ int i;
+
+ xMax = word2->xMax;
+ if (word2->yMin < yMin) {
+ yMin = word2->yMin;
+ }
+ if (word2->yMax > yMax) {
+ yMax = word2->yMax;
+ }
+ if (len + word2->len > size) {
+ size = len + word2->len;
+ text = (Unicode *)grealloc(text, size * sizeof(Unicode));
+ xRight = (double *)grealloc(xRight, size * sizeof(double));
+ }
+ for (i = 0; i < word2->len; ++i) {
+ text[len + i] = word2->text[i];
+ xRight[len + i] = word2->xRight[i];
+ }
+ len += word2->len;
+}
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+TextLine::TextLine() {
+ words = NULL;
+ text = NULL;
+ xRight = NULL;
+ col = NULL;
+ len = 0;
+ hyphenated = gFalse;
+ pageNext = NULL;
+ next = NULL;
+ flowNext = NULL;
+}
+
+TextLine::~TextLine() {
+ TextWord *w1, *w2;
+
+ for (w1 = words; w1; w1 = w2) {
+ w2 = w1->next;
+ delete w1;
+ }
+ gfree(text);
+ gfree(xRight);
+ gfree(col);
+}
+
+// Returns true if <this> comes before <line2> in yx order, allowing
+// slack for vertically overlapping lines.
+GBool TextLine::yxBefore(TextLine *line2) {
+ double dy;
+
+ dy = lineOverlapSlack * fontSize;
+
+ // non-overlapping case
+ if (line2->yMin > yMax - dy ||
+ line2->yMax < yMin + dy) {
+ return yMin < line2->yMin ||
+ (yMin == line2->yMin && xMin < line2->xMin);
+ }
+
+ // overlapping case
+ return xMin < line2->xMin;
+}
+
+// Merge another line's words onto the end of this line.
+void TextLine::merge(TextLine *line2) {
+ TextWord *word;
+ int newLen, i;
+
+ xMax = line2->xMax;
+ if (line2->yMin < yMin) {
+ yMin = line2->yMin;
+ }
+ if (line2->yMax > yMax) {
+ yMax = line2->yMax;
+ }
+ xSpaceR = line2->xSpaceR;
+ for (word = words; word->next; word = word->next) ;
+ word->spaceAfter = gTrue;
+ word->next = line2->words;
+ line2->words = NULL;
+ newLen = len + 1 + line2->len;
+ text = (Unicode *)grealloc(text, newLen * sizeof(Unicode));
+ xRight = (double *)grealloc(xRight, newLen * sizeof(double));
+ text[len] = (Unicode)0x0020;
+ xRight[len] = line2->xMin;
+ for (i = 0; i < line2->len; ++i) {
+ text[len + 1 + i] = line2->text[i];
+ xRight[len + 1 + i] = line2->xRight[i];
+ }
+ len = newLen;
+ convertedLen += line2->convertedLen;
+ hyphenated = line2->hyphenated;
+}
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+TextBlock::TextBlock() {
+ lines = NULL;
+ next = NULL;
+}
+
+TextBlock::~TextBlock() {
+ TextLine *l1, *l2;
+
+ for (l1 = lines; l1; l1 = l2) {
+ l2 = l1->next;
+ delete l1;
+ }
+}
+
+// Returns true if <this> comes before <blk2> in yx order, allowing
+// slack for vertically overlapping blocks.
+GBool TextBlock::yxBefore(TextBlock *blk2) {
+ double dy;
+
+ dy = blkOverlapSlack * lines->fontSize;
+
+ // non-overlapping case
+ if (blk2->yMin > yMax - dy ||
+ blk2->yMax < yMin + dy) {
+ return yMin < blk2->yMin ||
+ (yMin == blk2->yMin && xMin < blk2->xMin);
+ }
+
+ // overlapping case
+ return xMin < blk2->xMin;
+}
+
+// Merge another block's line onto the right of this one.
+void TextBlock::mergeRight(TextBlock *blk2) {
+ lines->merge(blk2->lines);
+ xMax = lines->xMax;
+ yMin = lines->yMin;
+ yMax = lines->yMax;
+ xSpaceR = lines->xSpaceR;
+}
+
+// Merge another block's lines onto the bottom of this block.
+void TextBlock::mergeBelow(TextBlock *blk2) {
+ TextLine *line;
+
+ if (blk2->xMin < xMin) {
+ xMin = blk2->xMin;
+ }
+ if (blk2->xMax > xMax) {
+ xMax = blk2->xMax;
+ }
+ yMax = blk2->yMax;
+ if (blk2->xSpaceL > xSpaceL) {
+ xSpaceL = blk2->xSpaceL;
+ }
+ if (blk2->xSpaceR < xSpaceR) {
+ xSpaceR = blk2->xSpaceR;
+ }
+ if (blk2->maxFontSize > maxFontSize) {
+ maxFontSize = blk2->maxFontSize;
+ }
+ for (line = lines; line->next; line = line->next) ;
+ line->next = line->flowNext = blk2->lines;
+ blk2->lines = NULL;
+}
+
+//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+TextFlow::TextFlow() {
+ blocks = NULL;
+ next = NULL;
+}
+
+TextFlow::~TextFlow() {
+ TextBlock *b1, *b2;
+
+ for (b1 = blocks; b1; b1 = b2) {
+ b2 = b1->next;
+ delete b1;
+ }
+}
+
+
//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
TextPage::TextPage(GBool rawOrderA) {
rawOrder = rawOrderA;
- curStr = NULL;
+ curWord = NULL;
+ font = NULL;
fontSize = 0;
- xyStrings = NULL;
- xyCur1 = xyCur2 = NULL;
- lines = NULL;
nest = 0;
nTinyChars = 0;
+ words = wordPtr = NULL;
+ lines = NULL;
+ flows = NULL;
+ fonts = new GList();
}
TextPage::~TextPage() {
clear();
+ delete fonts;
}
void TextPage::updateFont(GfxState *state) {
- GfxFont *font;
+ GfxFont *gfxFont;
double *fm;
char *name;
int code, mCode, letterCode, anyCode;
double w;
+ int i;
+
+ // get the font info object
+ font = NULL;
+ for (i = 0; i < fonts->getLength(); ++i) {
+ font = (TextFontInfo *)fonts->get(i);
+ if (font->matches(state)) {
+ break;
+ }
+ font = NULL;
+ }
+ if (!font) {
+ font = new TextFontInfo(state);
+ fonts->append(font);
+ }
// adjust the font size
+ gfxFont = state->getFont();
fontSize = state->getTransformedFontSize();
- if ((font = state->getFont()) && font->getType() == fontType3) {
+ if (gfxFont && gfxFont->getType() == fontType3) {
// This is a hack which makes it possible to deal with some Type 3
// fonts. The problem is that it's impossible to know what the
// base coordinate system used in the font is without actually
@@ -206,7 +473,7 @@ void TextPage::updateFont(GfxState *state) {
// subset that doesn't contain 'm').
mCode = letterCode = anyCode = -1;
for (code = 0; code < 256; ++code) {
- name = ((Gfx8BitFont *)font)->getCharName(code);
+ name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
if (name && name[0] == 'm' && name[1] == '\0') {
mCode = code;
}
@@ -215,647 +482,1202 @@ void TextPage::updateFont(GfxState *state) {
(name[0] >= 'a' && name[0] <= 'z'))) {
letterCode = code;
}
- if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) {
+ if (anyCode < 0 && name &&
+ ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
anyCode = code;
}
}
if (mCode >= 0 &&
- (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) {
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
// 0.6 is a generic average 'm' width -- yes, this is a hack
fontSize *= w / 0.6;
} else if (letterCode >= 0 &&
- (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) {
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
// even more of a hack: 0.5 is a generic letter width
fontSize *= w / 0.5;
} else if (anyCode >= 0 &&
- (w = ((Gfx8BitFont *)font)->getWidth(anyCode)) > 0) {
+ (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
// better than nothing: 0.5 is a generic character width
fontSize *= w / 0.5;
}
- fm = font->getFontMatrix();
+ fm = gfxFont->getFontMatrix();
if (fm[0] != 0) {
fontSize *= fabs(fm[3] / fm[0]);
}
}
}
-void TextPage::beginString(GfxState *state, double x0, double y0) {
+void TextPage::beginWord(GfxState *state, double x0, double y0) {
// This check is needed because Type 3 characters can contain
- // text-drawing operations.
- if (curStr) {
+ // text-drawing operations (when TextPage is being used via
+ // XOutputDev rather than TextOutputDev).
+ if (curWord) {
++nest;
return;
}
- curStr = new TextString(state, x0, y0, fontSize);
+ curWord = new TextWord(state, x0, y0, font, fontSize);
}
void TextPage::addChar(GfxState *state, double x, double y,
- double dx, double dy, Unicode *u, int uLen) {
- double x1, y1, w1, h1, dx2, dy2;
+ double dx, double dy,
+ CharCode c, Unicode *u, int uLen) {
+ double x1, y1, w1, h1, dx2, dy2, sp;
int n, i;
+ // if the previous char was a space, addChar will have called
+ // endWord, so we need to start a new word
+ if (!curWord) {
+ beginWord(state, x, y);
+ }
+
+ // throw away chars that aren't inside the page bounds
state->transform(x, y, &x1, &y1);
- if (x1 < 0 || x1 > state->getPageWidth() ||
- y1 < 0 || y1 > state->getPageHeight()) {
+ if (x1 < 0 || x1 > pageWidth ||
+ y1 < 0 || y1 > pageHeight) {
return;
}
- state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(),
- 0, &dx2, &dy2);
+
+ // subtract char and word spacing from the dx,dy values
+ sp = state->getCharSpace();
+ if (c == (CharCode)0x20) {
+ sp += state->getWordSpace();
+ }
+ state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
dx -= dx2;
dy -= dy2;
state->transformDelta(dx, dy, &w1, &h1);
+
+ // check the tiny chars limit
if (!globalParams->getTextKeepTinyChars() &&
fabs(w1) < 3 && fabs(h1) < 3) {
if (++nTinyChars > 20000) {
return;
}
}
- n = curStr->len;
- if (n > 0 && x1 - curStr->xRight[n-1] >
- 0.1 * (curStr->yMax - curStr->yMin)) {
- // large char spacing is sometimes used to move text around
- endString();
- beginString(state, x, y);
- }
- if (uLen == 1 && u[0] == (Unicode)0x20 &&
- w1 > 0.5 * (curStr->yMax - curStr->yMin)) {
- // large word spacing is sometimes used to move text around
+
+ // break words at space character
+ if (uLen == 1 && u[0] == (Unicode)0x20) {
+ endWord();
return;
}
+
+ // large char spacing is sometimes used to move text around -- in
+ // this case, break text into individual chars and let the coalesce
+ // function deal with it later
+ n = curWord->len;
+ if (n > 0 && x1 - curWord->xRight[n-1] >
+ curWord->font->minSpaceWidth * curWord->fontSize) {
+ // large char spacing is sometimes used to move text around
+ endWord();
+ beginWord(state, x, y);
+ }
+
+ // add the characters to the current word
if (uLen != 0) {
w1 /= uLen;
h1 /= uLen;
}
for (i = 0; i < uLen; ++i) {
- curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
+ curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
}
}
-void TextPage::endString() {
+void TextPage::endWord() {
// This check is needed because Type 3 characters can contain
- // text-drawing operations.
+ // text-drawing operations (when TextPage is being used via
+ // XOutputDev rather than TextOutputDev).
if (nest > 0) {
--nest;
return;
}
- addString(curStr);
- curStr = NULL;
+ if (curWord) {
+ addWord(curWord);
+ curWord = NULL;
+ }
}
-void TextPage::addString(TextString *str) {
- TextString *p1, *p2;
+void TextPage::addWord(TextWord *word) {
+ TextWord *p1, *p2;
- // throw away zero-length strings -- they don't have valid xMin/xMax
+ // throw away zero-length words -- they don't have valid xMin/xMax
// values, and they're useless anyway
- if (str->len == 0) {
- delete str;
+ if (word->len == 0) {
+ delete word;
return;
}
- // insert string in xy list
+ // insert word in xy list
if (rawOrder) {
- p1 = xyCur1;
+ p1 = wordPtr;
p2 = NULL;
- } else if ((!xyCur1 || xyBefore(xyCur1, str)) &&
- (!xyCur2 || xyBefore(str, xyCur2))) {
- p1 = xyCur1;
- p2 = xyCur2;
- } else if (xyCur1 && xyBefore(xyCur1, str)) {
- for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) {
- if (xyBefore(str, p2)) {
- break;
- }
- }
- xyCur2 = p2;
} else {
- for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) {
- if (xyBefore(str, p2)) {
+ if (wordPtr && wordPtr->xyBefore(word)) {
+ p1 = wordPtr;
+ p2 = wordPtr->next;
+ } else {
+ p1 = NULL;
+ p2 = words;
+ }
+ for (; p2; p1 = p2, p2 = p2->next) {
+ if (word->xyBefore(p2)) {
break;
}
}
- xyCur2 = p2;
}
- xyCur1 = str;
if (p1) {
- p1->next = str;
+ p1->next = word;
} else {
- xyStrings = str;
+ words = word;
}
- str->next = p2;
+ word->next = p2;
+ wordPtr = word;
}
void TextPage::coalesce() {
- TextLine *line, *line0;
- TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2;
- TextString *str0, *str1, *str2, *str3, *str4;
- TextString *str1prev, *str2prev, *str3prev;
- TextOutColumnEdge *edges;
+ TextWord *word0, *word1, *word2, *word3, *word4;
+ TextLine *line0, *line1, *line2, *line3, *line4, *lineList;
+ TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6;
+ TextBlock *yxBlocks, *blocks, *blkStack;
+ TextFlow *flow0, *flow1;
+ double sz, xLimit, minSpace, maxSpace, yLimit;
+ double fit1, fit2;
+ GBool found;
UnicodeMap *uMap;
GBool isUnicode;
char buf[8];
- int edgesLength, edgesSize;
- double x, yMin, yMax;
- double space, fit1, fit2, h;
- int col1, col2, d;
- int i, j;
-
-#if 0 //~ for debugging
- for (str1 = xyStrings; str1; str1 = str1->next) {
- printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
- str1->xMin, str1->xMax, str1->yMin, str1->yMax,
- (str1->yMax - str1->yMin));
- for (i = 0; i < str1->len; ++i) {
- fputc(str1->text[i] & 0xff, stdout);
+ int col1, col2, d, i, j;
+
+#if 0 // for debugging
+ printf("*** initial word list ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
}
printf("'\n");
}
- printf("\n------------------------------------------------------------\n\n");
+ printf("\n");
+ fflush(stdout);
#endif
- // build the list of column edges
- edges = NULL;
- edgesLength = edgesSize = 0;
- if (!rawOrder) {
- for (str1prev = NULL, str1 = xyStrings;
- str1;
- str1prev = str1, str1 = str1->next) {
- if (str1->marked) {
- continue;
- }
- h = str1->yMax - str1->yMin;
- if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) {
- continue;
- }
- x = str1->xMin;
- yMin = str1->yMin;
- yMax = str1->yMax;
- for (str2prev = str1, str2 = str1->next;
- str2;
- str2prev = str2, str2 = str2->next) {
- h = str2->yMax - str2->yMin;
- if (!str2->marked &&
- (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
- fabs(str2->xMin - x) < 0.5 &&
- str2->yMin - yMax < 0.3 * h &&
- yMin - str2->yMax < 0.3 * h) {
- break;
- }
- }
- if (str2) {
- if (str2->yMin < yMin) {
- yMin = str2->yMin;
- }
- if (str2->yMax > yMax) {
- yMax = str2->yMax;
- }
- str2->marked = gTrue;
- for (str3prev = str1, str3 = str1->next;
- str3;
- str3prev = str3, str3 = str3->next) {
- h = str3->yMax - str3->yMin;
- if (!str3->marked &&
- (str3->xMin - str3prev->xMax) / h > textOutColSpace &&
- fabs(str3->xMin - x) < 0.5 &&
- str3->yMin - yMax < 0.3 * h &&
- yMin - str3->yMax < 0.3 * h) {
- break;
- }
- }
- if (str3) {
- if (str3->yMin < yMin) {
- yMin = str3->yMin;
- }
- if (str3->yMax > yMax) {
- yMax = str3->yMax;
- }
- str3->marked = gTrue;
- do {
- for (str2prev = str1, str2 = str1->next;
- str2;
- str2prev = str2, str2 = str2->next) {
- h = str2->yMax - str2->yMin;
- if (!str2->marked &&
- (str2->xMin - str2prev->xMax) / h > textOutColSpace &&
- fabs(str2->xMin - x) < 0.5 &&
- str2->yMin - yMax < 0.3 * h &&
- yMin - str2->yMax < 0.3 * h) {
- if (str2->yMin < yMin) {
- yMin = str2->yMin;
- }
- if (str2->yMax > yMax) {
- yMax = str2->yMax;
- }
- str2->marked = gTrue;
- break;
- }
- }
- } while (str2);
- if (edgesLength == edgesSize) {
- edgesSize = edgesSize ? 2 * edgesSize : 16;
- edges = (TextOutColumnEdge *)
- grealloc(edges, edgesSize * sizeof(TextOutColumnEdge));
- }
- edges[edgesLength].x = x;
- edges[edgesLength].y0 = yMin;
- edges[edgesLength].y1 = yMax;
- ++edgesLength;
- } else {
- str2->marked = gFalse;
- }
+ //----- discard duplicated text (fake boldface, drop shadows)
+
+ word0 = words;
+ while (word0) {
+ sz = word0->fontSize;
+ xLimit = word0->xMin + sz * dupMaxDeltaX;
+ found = gFalse;
+ for (word1 = word0, word2 = word0->next;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ if (word2->len == word0->len &&
+ !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) &&
+ fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY &&
+ fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY &&
+ fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) {
+ found = gTrue;
+ break;
}
- str1->marked = gTrue;
+ }
+ if (found) {
+ word1->next = word2->next;
+ delete word2;
+ } else {
+ word0 = word0->next;
}
}
-#if 0 //~ for debugging
- printf("column edges:\n");
- for (i = 0; i < edgesLength; ++i) {
- printf("%d: x=%.2f y0=%.2f y1=%.2f\n",
- i, edges[i].x, edges[i].y0, edges[i].y1);
+#if 0 // for debugging
+ printf("*** words after removing duplicate text ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
}
- printf("\n------------------------------------------------------------\n\n");
+ printf("\n");
+ fflush(stdout);
#endif
- // build the blocks
- yxBlocks = NULL;
- blk1 = blk2 = NULL;
- while (xyStrings) {
-
- // build the block
- str0 = xyStrings;
- xyStrings = xyStrings->next;
- str0->next = NULL;
- blk = new TextBlock();
- blk->strings = str0;
- blk->xMin = str0->xMin;
- blk->xMax = str0->xMax;
- blk->yMin = str0->yMin;
- blk->yMax = str0->yMax;
- while (xyStrings) {
- str1 = NULL;
- str2 = xyStrings;
- fit1 = coalesceFit(str0, str2);
- if (!rawOrder) {
- // look for best-fitting string
- space = str0->yMax - str0->yMin;
- for (str3 = xyStrings, str4 = xyStrings->next;
- str4 && str4->xMin - str0->xMax <= space;
- str3 = str4, str4 = str4->next) {
- fit2 = coalesceFit(str0, str4);
- if (fit2 < fit1) {
- str1 = str3;
- str2 = str4;
- fit1 = fit2;
- }
- }
- }
- if (fit1 > 1) {
- // no fit - we're done with this block
- break;
- }
-
- // if we've hit a column edge we're done with this block
- if (fit1 > 0.2) {
- for (i = 0; i < edgesLength; ++i) {
- if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin &&
- str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 &&
- str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) {
- break;
- }
- }
- if (i < edgesLength) {
+ //----- merge words
+
+ word0 = words;
+ while (word0) {
+ sz = word0->fontSize;
+
+ // look for adjacent text which is part of the same word, and
+ // merge it into this word
+ xLimit = word0->xMax + sz * word0->font->minSpaceWidth;
+ if (rawOrder) {
+ word1 = word0;
+ word2 = word0->next;
+ found = word2 &&
+ word2->xMin < xLimit &&
+ word2->font == word0->font &&
+ fabs(word2->fontSize - sz) < 0.05 &&
+ fabs(word2->yBase - word0->yBase) < 0.05;
+ } else {
+ found = gFalse;
+ for (word1 = word0, word2 = word0->next;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ if (word2->font == word0->font &&
+ fabs(word2->fontSize - sz) < 0.05 &&
+ fabs(word2->yBase - word0->yBase) < 0.05) {
+ found = gTrue;
break;
}
}
-
- if (str1) {
- str1->next = str2->next;
- } else {
- xyStrings = str2->next;
- }
- str0->next = str2;
- str2->next = NULL;
- if (str2->xMax > blk->xMax) {
- blk->xMax = str2->xMax;
- }
- if (str2->yMin < blk->yMin) {
- blk->yMin = str2->yMin;
- }
- if (str2->yMax > blk->yMax) {
- blk->yMax = str2->yMax;
- }
- str0 = str2;
- }
-
- // insert block on list
- if (!rawOrder) {
- // insert block on list in yx order
- for (blk1 = NULL, blk2 = yxBlocks;
- blk2 && !yxBefore(blk, blk2);
- blk1 = blk2, blk2 = blk2->next) ;
}
- blk->next = blk2;
- if (blk1) {
- blk1->next = blk;
- } else {
- yxBlocks = blk;
+ if (found) {
+ word0->merge(word2);
+ word1->next = word2->next;
+ delete word2;
+ continue;
}
- blk1 = blk;
+
+ word0 = word0->next;
}
- gfree(edges);
+#if 0 // for debugging
+ printf("*** after merging words ***\n");
+ for (word0 = words; word0; word0 = word0->next) {
+ printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
- // the strings are now owned by the lines/blocks tree
- xyStrings = NULL;
+ //----- assemble words into lines
- // build the block text
uMap = globalParams->getTextEncoding();
isUnicode = uMap ? uMap->isUnicode() : gFalse;
- for (blk = yxBlocks; blk; blk = blk->next) {
- blk->len = 0;
- for (str1 = blk->strings; str1; str1 = str1->next) {
- blk->len += str1->len;
- if (str1->next && str1->next->xMin - str1->xMax >
- textOutSpace * (str1->yMax - str1->yMin)) {
- str1->spaceAfter = gTrue;
- ++blk->len;
+
+ lineList = NULL;
+ line0 = NULL;
+ while (words) {
+
+ // build a new line object
+ word0 = words;
+ words = words->next;
+ word0->next = NULL;
+ line1 = new TextLine();
+ line1->words = word0;
+ line1->xMin = word0->xMin;
+ line1->xMax = word0->xMax;
+ line1->yMin = word0->yMin;
+ line1->yMax = word0->yMax;
+ line1->yBase = word0->yBase;
+ line1->font = word0->font;
+ line1->fontSize = word0->fontSize;
+ line1->len = word0->len;
+ minSpace = line1->fontSize * word0->font->minSpaceWidth;
+ maxSpace = line1->fontSize * word0->font->maxSpaceWidth;
+
+ // find subsequent words in the line
+ while (words) {
+ xLimit = line1->xMax + maxSpace;
+ fit1 = fit2 = 0;
+ word3 = word4 = NULL;
+ if (rawOrder) {
+ if (words &&
+ words->xMin < xLimit &&
+ ((fit1 = lineFit(line1, word0, words)) >= 0)) {
+ word3 = NULL;
+ word4 = words;
+ }
+ } else {
+ for (word1 = NULL, word2 = words;
+ word2 && word2->xMin < xLimit;
+ word1 = word2, word2 = word2->next) {
+ fit2 = lineFit(line1, word0, word2);
+ if (fit2 >= 0 && (!word4 ||
+ (word4 && fit2 < fit1))) {
+ fit1 = fit2;
+ word3 = word1;
+ word4 = word2;
+ }
+ }
+ }
+ if (word4) {
+ if (word3) {
+ word3->next = word4->next;
+ } else {
+ words = word4->next;
+ }
+ word0->next = word4;
+ word4->next = NULL;
+ if (word4->xMax > line1->xMax) {
+ line1->xMax = word4->xMax;
+ }
+ if (word4->yMin < line1->yMin) {
+ line1->yMin = word4->yMin;
+ }
+ if (word4->yMax > line1->yMax) {
+ line1->yMax = word4->yMax;
+ }
+ line1->len += word4->len;
+ if (fit1 > minSpace) {
+ word0->spaceAfter = gTrue;
+ ++line1->len;
+ }
+ word0 = word4;
} else {
- str1->spaceAfter = gFalse;
+ break;
}
}
- blk->text = (Unicode *)gmalloc(blk->len * sizeof(Unicode));
- blk->xRight = (double *)gmalloc(blk->len * sizeof(double));
- blk->col = (int *)gmalloc(blk->len * sizeof(int));
+
+ // build the line text
+ line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode));
+ line1->xRight = (double *)gmalloc(line1->len * sizeof(double));
+ line1->col = (int *)gmalloc(line1->len * sizeof(int));
i = 0;
- for (str1 = blk->strings; str1; str1 = str1->next) {
- for (j = 0; j < str1->len; ++j) {
- blk->text[i] = str1->text[j];
- blk->xRight[i] = str1->xRight[j];
+ for (word1 = line1->words; word1; word1 = word1->next) {
+ for (j = 0; j < word1->len; ++j) {
+ line1->text[i] = word1->text[j];
+ line1->xRight[i] = word1->xRight[j];
++i;
}
- if (str1->spaceAfter) {
- blk->text[i] = (Unicode)0x0020;
- blk->xRight[i] = str1->next->xMin;
+ if (word1->spaceAfter && word1->next) {
+ line1->text[i] = (Unicode)0x0020;
+ line1->xRight[i] = word1->next->xMin;
++i;
}
}
- blk->convertedLen = 0;
- for (j = 0; j < blk->len; ++j) {
- blk->col[j] = blk->convertedLen;
+ line1->convertedLen = 0;
+ for (j = 0; j < line1->len; ++j) {
+ line1->col[j] = line1->convertedLen;
if (isUnicode) {
- ++blk->convertedLen;
+ ++line1->convertedLen;
} else if (uMap) {
- blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf));
+ line1->convertedLen +=
+ uMap->mapUnicode(line1->text[j], buf, sizeof(buf));
}
}
+
+ // check for hyphen at end of line
+ //~ need to check for other chars used as hyphens
+ if (line1->text[line1->len - 1] == (Unicode)'-') {
+ line1->hyphenated = gTrue;
+ }
+
+ // insert line on list
+ if (line0) {
+ line0->next = line1;
+ } else {
+ lineList = line1;
+ }
+ line0 = line1;
}
+
if (uMap) {
uMap->decRefCnt();
}
-#if 0 //~ for debugging
- for (blk = yxBlocks; blk; blk = blk->next) {
- printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
- blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
- TextString *str;
- for (str = blk->strings; str; str = str->next) {
- printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'",
- str->xMin, str->xMax, str->yMin, str->yMax,
- (str->yMax - str->yMin));
- for (i = 0; i < str->len; ++i) {
- fputc(str->text[i] & 0xff, stdout);
- }
- if (str->spaceAfter) {
- fputc(' ', stdout);
+#if 0 // for debugging
+ printf("*** lines in xy order ***\n");
+ for (line0 = lineList; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
}
printf("'\n");
}
}
- printf("\n------------------------------------------------------------\n\n");
+ printf("\n");
+ fflush(stdout);
#endif
- // build the lines
- lines = NULL;
- line0 = NULL;
- while (yxBlocks) {
- blk0 = yxBlocks;
- yxBlocks = yxBlocks->next;
- blk0->next = NULL;
- line = new TextLine();
- line->blocks = blk0;
- line->yMin = blk0->yMin;
- line->yMax = blk0->yMax;
- while (yxBlocks) {
+ //----- column assignment
- // remove duplicated text (fake boldface, shadowed text)
- h = blk0->yMax - blk0->yMin;
- if (yxBlocks->len == blk0->len &&
- !memcmp(yxBlocks->text, blk0->text,
- yxBlocks->len * sizeof(Unicode)) &&
- fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 &&
- fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 &&
- fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 &&
- fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) {
- blk1 = yxBlocks;
- yxBlocks = yxBlocks->next;
- delete blk1;
- continue;
+ for (line1 = lineList; line1; line1 = line1->next) {
+ col1 = 0;
+ for (line2 = lineList; line2 != line1; line2 = line2->next) {
+ if (line1->xMin >= line2->xMax) {
+ d = (int)((line1->xMin - line2->xMax) /
+ (line1->font->maxSpaceWidth * line1->fontSize));
+ if (d > 4) {
+ d = 4;
+ }
+ col2 = line2->col[0] + line2->convertedLen + d;
+ if (col2 > col1) {
+ col1 = col2;
+ }
+ } else if (line1->xMin > line2->xMin) {
+ for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ;
+ col2 = line2->col[i];
+ if (col2 > col1) {
+ col1 = col2;
+ }
}
+ }
+ for (j = 0; j < line1->len; ++j) {
+ line1->col[j] += col1;
+ }
+ }
- if (rawOrder && yxBlocks->yMax < blk0->yMin) {
- break;
+ //----- assemble lines into blocks
+
+ if (rawOrder) {
+
+ lines = lineList;
+ for (line1 = lines; line1; line1 = line1->next) {
+ line1->xSpaceL = 0;
+ line1->xSpaceR = pageWidth;
+ }
+
+ } else {
+
+ // sort lines into yx order
+ lines = NULL;
+ while (lineList) {
+ line0 = lineList;
+ lineList = lineList->next;
+ for (line1 = NULL, line2 = lines;
+ line2 && !line0->yxBefore(line2);
+ line1 = line2, line2 = line2->next) ;
+ if (line1) {
+ line1->next = line0;
+ } else {
+ lines = line0;
}
- if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax ||
- yxBlocks->xMin < blk0->xMax) {
- break;
+ line0->next = line2;
+ }
+
+ // compute whitespace to left and right of each line
+ line0 = lines;
+ for (line1 = lines; line1; line1 = line1->next) {
+
+ // find the first vertically overlapping line
+ for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ;
+
+ // check each vertically overlapping line -- look for the nearest
+ // on each side
+ line1->xSpaceL = 0;
+ line1->xSpaceR = pageWidth;
+ for (line2 = line0;
+ line2 && line2->yMin < line1->yMax;
+ line2 = line2->next) {
+ if (line2->yMax > line1->yMin) {
+ if (line2->xMax < line1->xMin) {
+ if (line2->xMax > line1->xSpaceL) {
+ line1->xSpaceL = line2->xMax;
+ }
+ } else if (line2->xMin > line1->xMax) {
+ if (line2->xMin < line1->xSpaceR) {
+ line1->xSpaceR = line2->xMin;
+ }
+ }
+ }
}
- blk1 = yxBlocks;
- yxBlocks = yxBlocks->next;
- blk0->next = blk1;
- blk1->next = NULL;
- if (blk1->yMin < line->yMin) {
- line->yMin = blk1->yMin;
+ }
+ } // (!rawOrder)
+
+#if 0 // for debugging
+ printf("*** lines in yx order ***\n");
+ for (line0 = lines; line0; line0 = line0->next) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->fontSize, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
}
- if (blk1->yMax > line->yMax) {
- line->yMax = blk1->yMax;
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ lineList = lines;
+ yxBlocks = NULL;
+ blk0 = NULL;
+ while (lineList) {
+
+ // build a new block object
+ line0 = lineList;
+ lineList = lineList->next;
+ line0->next = NULL;
+ blk1 = new TextBlock();
+ blk1->lines = line0;
+ blk1->xMin = line0->xMin;
+ blk1->xMax = line0->xMax;
+ blk1->yMin = line0->yMin;
+ blk1->yMax = line0->yMax;
+ blk1->xSpaceL = line0->xSpaceL;
+ blk1->xSpaceR = line0->xSpaceR;
+ blk1->maxFontSize = line0->fontSize;
+
+ // find subsequent lines in the block
+ while (lineList) {
+
+ // look for the first horizontally overlapping line below this
+ // one
+ yLimit = line0->yMax + blkMaxSpacing * line0->fontSize;
+ line3 = line4 = NULL;
+ if (rawOrder) {
+ if (lineList->yMin < yLimit &&
+ lineList->xMax > blk1->xMin &&
+ lineList->xMin < blk1->xMax) {
+ line3 = NULL;
+ line4 = lineList;
+ }
+ } else {
+ for (line1 = NULL, line2 = lineList;
+ line2 && line2->yMin < yLimit;
+ line1 = line2, line2 = line2->next) {
+ if (line2->xMax > blk1->xMin &&
+ line2->xMin < blk1->xMax) {
+ line3 = line1;
+ line4 = line2;
+ break;
+ }
+ }
+ }
+
+ // if there is an overlapping line and it fits in the block, add
+ // it to the block
+ if (line4 && blockFit(blk1, line4)) {
+ if (line3) {
+ line3->next = line4->next;
+ } else {
+ lineList = line4->next;
+ }
+ line0->next = line0->flowNext = line4;
+ line4->next = NULL;
+ if (line4->xMin < blk1->xMin) {
+ blk1->xMin = line4->xMin;
+ } else if (line4->xMax > blk1->xMax) {
+ blk1->xMax = line4->xMax;
+ }
+ if (line4->yMax > blk1->yMax) {
+ blk1->yMax = line4->yMax;
+ }
+ if (line4->xSpaceL > blk1->xSpaceL) {
+ blk1->xSpaceL = line4->xSpaceL;
+ }
+ if (line4->xSpaceR < blk1->xSpaceR) {
+ blk1->xSpaceR = line4->xSpaceR;
+ }
+ if (line4->fontSize > blk1->maxFontSize) {
+ blk1->maxFontSize = line4->fontSize;
+ }
+ line0 = line4;
+
+ // otherwise, we're done with this block
+ } else {
+ break;
}
+ }
+
+ // insert block on list, in yx order
+ if (rawOrder) {
+ blk2 = blk0;
+ blk3 = NULL;
blk0 = blk1;
+ } else {
+ for (blk2 = NULL, blk3 = yxBlocks;
+ blk3 && !blk1->yxBefore(blk3);
+ blk2 = blk3, blk3 = blk3->next) ;
}
- if (line0) {
- line0->next = line;
+ blk1->next = blk3;
+ if (blk2) {
+ blk2->next = blk1;
} else {
- lines = line;
+ yxBlocks = blk1;
}
- line->next = NULL;
- line0 = line;
}
-
- // sort the blocks into xy order
- xyBlocks = NULL;
- for (line = lines; line; line = line->next) {
- for (blk = line->blocks; blk; blk = blk->next) {
- for (blk1 = NULL, blk2 = xyBlocks;
- blk2 && !xyBefore(blk, blk2);
- blk1 = blk2, blk2 = blk2->xyNext) ;
- blk->xyNext = blk2;
- if (blk1) {
- blk1->xyNext = blk;
- } else {
- xyBlocks = blk;
+#if 0 // for debugging
+ printf("*** blocks in yx order ***\n");
+ for (blk0 = yxBlocks; blk0; blk0 = blk0->next) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
}
}
}
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- merge lines and blocks, sort blocks into reading order
+
+ if (rawOrder) {
+ blocks = yxBlocks;
+
+ } else {
+ blocks = NULL;
+ blk0 = NULL;
+ blkStack = NULL;
+ while (yxBlocks) {
-#if 0 //~ for debugging
- for (blk = xyBlocks; blk; blk = blk->xyNext) {
- printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n",
- blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len);
- TextString *str;
- for (str = blk->strings; str; str = str->next) {
- printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
- str->xMin, str->xMax, str->yMin, str->yMax,
- (str->yMax - str->yMin));
- for (i = 0; i < str->len; ++i) {
- fputc(str->text[i] & 0xff, stdout);
+ // find the next two blocks:
+ // - if the depth-first traversal stack is empty, take the first
+ // (upper-left-most) two blocks on the yx-sorted block list
+ // - otherwise, find the two upper-left-most blocks under the top
+ // block on the stack
+ if (blkStack) {
+ blk3 = blk4 = blk5 = blk6 = NULL;
+ for (blk1 = NULL, blk2 = yxBlocks;
+ blk2;
+ blk1 = blk2, blk2 = blk2->next) {
+ if (blk2->yMin > blkStack->yMin &&
+ blk2->xMax > blkStack->xMin &&
+ blk2->xMin < blkStack->xMax) {
+ if (!blk4 || blk2->yxBefore(blk4)) {
+ blk5 = blk3;
+ blk6 = blk4;
+ blk3 = blk1;
+ blk4 = blk2;
+ } else if (!blk6 || blk2->yxBefore(blk6)) {
+ blk5 = blk1;
+ blk6 = blk2;
+ }
+ }
+ }
+ } else {
+ blk3 = NULL;
+ blk4 = yxBlocks;
+ blk5 = yxBlocks;
+ blk6 = yxBlocks->next;
+ }
+
+ // merge case 1:
+ // | | |
+ // | blkStack | | blkStack
+ // +---------------------+ --> +--------------
+ // +------+ +------+ +-----------+
+ // | blk4 | | blk6 | ... | blk4+blk6 |
+ // +------+ +------+ +-----------+
+ if (blkStack) {
+ yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize;
+ }
+ if (blkStack && blk4 && blk6 &&
+ !blk4->lines->next && !blk6->lines->next &&
+ lineFit2(blk4->lines, blk6->lines) &&
+ blk4->yMin < yLimit &&
+ blk4->xMin > blkStack->xSpaceL &&
+ blkStack->xMin > blk4->xSpaceL &&
+ blk6->xMax < blkStack->xSpaceR) {
+ blk4->mergeRight(blk6);
+ if (blk5) {
+ blk5->next = blk6->next;
+ } else {
+ yxBlocks = blk6->next;
+ }
+ delete blk6;
+
+ // merge case 2:
+ // | | | |
+ // | blkStack | | |
+ // +---------------------+ --> | blkStack+blk4 |
+ // +---------------------+ | |
+ // | blk4 | | |
+ // | | | |
+ } else if (blkStack && blk4 &&
+ blk4->yMin < yLimit &&
+ blockFit2(blkStack, blk4)) {
+ blkStack->mergeBelow(blk4);
+ if (blk3) {
+ blk3->next = blk4->next;
+ } else {
+ yxBlocks = blk4->next;
+ }
+ delete blk4;
+
+ // if any of:
+ // 1. no block found
+ // 2. non-fully overlapping block found
+ // 3. large vertical gap above the overlapping block
+ // then pop the stack and try again
+ } else if (!blk4 ||
+ (blkStack && (blk4->xMin < blkStack->xSpaceL ||
+ blk4->xMax > blkStack->xSpaceR ||
+ blk4->yMin - blkStack->yMax >
+ blkMaxSortSpacing * blkStack->maxFontSize))) {
+ blkStack = blkStack->stackNext;
+
+ // add a block to the sorted list
+ } else {
+
+ // remove the block from the yx-sorted list
+ if (blk3) {
+ blk3->next = blk4->next;
+ } else {
+ yxBlocks = blk4->next;
+ }
+ blk4->next = NULL;
+
+ // append the block to the reading-order list
+ if (blk0) {
+ blk0->next = blk4;
+ } else {
+ blocks = blk4;
+ }
+ blk0 = blk4;
+
+ // push the block on the traversal stack
+ blk4->stackNext = blkStack;
+ blkStack = blk4;
+ }
+ }
+ } // (!rawOrder)
+
+#if 0 // for debugging
+ printf("*** blocks in reading order (after merging) ***\n");
+ for (blk0 = blocks; blk0; blk0 = blk0->next) {
+ printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
}
- printf("'\n");
}
}
- printf("\n------------------------------------------------------------\n\n");
+ printf("\n");
+ fflush(stdout);
#endif
- // do column assignment
- for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) {
- col1 = 0;
- for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) {
- if (blk1->xMin >= blk2->xMax) {
- d = (int)((blk1->xMin - blk2->xMax) /
- (0.4 * (blk1->yMax - blk1->yMin)));
- if (d > 4) {
- d = 4;
+ //----- assemble blocks into flows
+
+ if (rawOrder) {
+
+ // one flow per block
+ flow0 = NULL;
+ while (blocks) {
+ flow1 = new TextFlow();
+ flow1->blocks = blocks;
+ flow1->lines = blocks->lines;
+ flow1->yMin = blocks->yMin;
+ flow1->yMax = blocks->yMax;
+ blocks = blocks->next;
+ flow1->blocks->next = NULL;
+ if (flow0) {
+ flow0->next = flow1;
+ } else {
+ flows = flow1;
+ }
+ flow0 = flow1;
+ }
+
+ } else {
+
+ // compute whitespace above and below each block
+ for (blk0 = blocks; blk0; blk0 = blk0->next) {
+ blk0->ySpaceT = 0;
+ blk0->ySpaceB = pageHeight;
+
+ // check each horizontally overlapping block
+ for (blk1 = blocks; blk1; blk1 = blk1->next) {
+ if (blk1 != blk0 &&
+ blk1->xMin < blk0->xMax &&
+ blk1->xMax > blk0->xMin) {
+ if (blk1->yMax < blk0->yMin) {
+ if (blk1->yMax > blk0->ySpaceT) {
+ blk0->ySpaceT = blk1->yMax;
+ }
+ } else if (blk1->yMin > blk0->yMax) {
+ if (blk1->yMin < blk0->ySpaceB) {
+ blk0->ySpaceB = blk1->yMin;
+ }
+ }
}
- col2 = blk2->col[0] + blk2->convertedLen + d;
- if (col2 > col1) {
- col1 = col2;
+ }
+ }
+
+ flow0 = NULL;
+ while (blocks) {
+
+ // build a new flow object
+ flow1 = new TextFlow();
+ flow1->blocks = blocks;
+ flow1->lines = blocks->lines;
+ flow1->yMin = blocks->yMin;
+ flow1->yMax = blocks->yMax;
+ flow1->ySpaceT = blocks->ySpaceT;
+ flow1->ySpaceB = blocks->ySpaceB;
+
+ // find subsequent blocks in the flow
+ for (blk1 = blocks, blk2 = blocks->next;
+ blk2 && flowFit(flow1, blk2);
+ blk1 = blk2, blk2 = blk2->next) {
+ if (blk2->yMin < flow1->yMin) {
+ flow1->yMin = blk2->yMin;
}
- } else if (blk1->xMin > blk2->xMin) {
- for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ;
- col2 = blk2->col[i];
- if (col2 > col1) {
- col1 = col2;
+ if (blk2->yMax > flow1->yMax) {
+ flow1->yMax = blk2->yMax;
+ }
+ if (blk2->ySpaceT > flow1->ySpaceT) {
+ flow1->ySpaceT = blk2->ySpaceT;
}
+ if (blk2->ySpaceB < flow1->ySpaceB) {
+ flow1->ySpaceB = blk2->ySpaceB;
+ }
+ for (line1 = blk1->lines; line1->next; line1 = line1->next) ;
+ line1->flowNext = blk2->lines;
+ }
+
+ // chop the block list
+ blocks = blk1->next;
+ blk1->next = NULL;
+
+ // append the flow to the list
+ if (flow0) {
+ flow0->next = flow1;
+ } else {
+ flows = flow1;
}
+ flow0 = flow1;
}
- for (j = 0; j < blk1->len; ++j) {
- blk1->col[j] += col1;
+ }
+
+#if 0 // for debugging
+ printf("*** flows ***\n");
+ for (flow0 = flows; flow0; flow0 = flow0->next) {
+ printf("[flow]\n");
+ for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) {
+ printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n",
+ blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax,
+ blk0->ySpaceT, blk0->ySpaceB);
+ for (line0 = blk0->lines; line0; line0 = line0->next) {
+ printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
}
}
+ printf("\n");
+ fflush(stdout);
+#endif
+
+ //----- sort lines into yx order
-#if 0 //~ for debugging
- for (line = lines; line; line = line->next) {
- printf("[line]\n");
- for (blk = line->blocks; blk; blk = blk->next) {
- printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len);
- TextString *str;
- for (str = blk->strings; str; str = str->next) {
- printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '",
- str->xMin, str->xMax, str->yMin, str->yMax,
- (str->yMax - str->yMin));
- for (i = 0; i < str->len; ++i) {
- fputc(str->text[i] & 0xff, stdout);
+ // (the block/line merging process doesn't maintain the full-page
+ // linked list of lines)
+
+ lines = NULL;
+ if (rawOrder) {
+ line0 = NULL;
+ for (flow0 = flows; flow0; flow0 = flow0->next) {
+ for (line1 = flow0->lines; line1; line1 = line1->flowNext) {
+ if (line0) {
+ line0->pageNext = line1;
+ } else {
+ lines = line1;
}
- if (str->spaceAfter) {
- printf(" [space]\n");
+ line0 = line1;
+ }
+ }
+ } else {
+ for (flow0 = flows; flow0; flow0 = flow0->next) {
+ for (line0 = flow0->lines; line0; line0 = line0->flowNext) {
+ for (line1 = NULL, line2 = lines;
+ line2 && !line0->yxBefore(line2);
+ line1 = line2, line2 = line2->pageNext) ;
+ if (line1) {
+ line1->pageNext = line0;
+ } else {
+ lines = line0;
}
- printf("'\n");
+ line0->pageNext = line2;
}
}
}
- printf("\n------------------------------------------------------------\n\n");
+
+#if 0 // for debugging
+ printf("*** lines in yx order ***\n");
+ for (line0 = lines; line0; line0 = line0->pageNext) {
+ printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n",
+ line0->xMin, line0->xMax, line0->yMin, line0->yMax,
+ line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0],
+ line0->len);
+ for (word0 = line0->words; word0; word0 = word0->next) {
+ printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '",
+ word0->xMin, word0->xMax, word0->yMin, word0->yMax,
+ word0->yBase, word0->spaceAfter);
+ for (i = 0; i < word0->len; ++i) {
+ fputc(word0->text[i] & 0xff, stdout);
+ }
+ printf("'\n");
+ }
+ }
+ printf("\n");
+ fflush(stdout);
#endif
}
+// Returns a non-negative number if <word> can be added to <line>
+// (whose last word is <lastWord>). A smaller return value indicates
+// a better fit. If <word> cannot be added to <line> at all, returns
+// a negative number.
+double TextPage::lineFit(TextLine *line, TextWord *lastWord, TextWord *word) {
+ double fontSize0, fontSize1;
+ double dx, dxLimit;
+
+ fontSize0 = line->fontSize;
+ fontSize1 = word->fontSize;
+ dx = word->xMin - lastWord->xMax;
+ dxLimit = fontSize0 * line->font->maxSpaceWidth;
+
+ // check inter-word spacing
+ if (dx < fontSize0 * lineMinDeltaX ||
+ dx > dxLimit) {
+ return -1;
+ }
+
+ // ensure a non-negative return value
+ if (dx < 0) {
+ dx = 0;
+ }
+
+ // look for adjacent words with close baselines and close font sizes
+ if (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 &&
+ fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
+ fontSize1 < lineMaxFontSizeRatio * fontSize0) {
+ return dx;
+ }
+
+ // look for a superscript
+ if (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 &&
+ fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 &&
+ (word->yMax < lastWord->yMax ||
+ word->yBase < lastWord->yBase) &&
+ word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 &&
+ dx < fontSize0 * lineMaxSuperscriptDeltaX) {
+ return dx;
+ }
+
+ // look for a subscript
+ if (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 &&
+ fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 &&
+ (word->yMin > lastWord->yMin ||
+ word->yBase > lastWord->yBase) &&
+ line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 &&
+ dx < fontSize0 * lineMaxSubscriptDeltaX) {
+ return dx;
+ }
+
+ return -1;
+}
+
+// Returns true if <line0> and <line1> can be merged into a single
+// line, ignoring max word spacing.
+GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) {
+ double fontSize0, fontSize1;
+ double dx;
+
+ fontSize0 = line0->fontSize;
+ fontSize1 = line1->fontSize;
+ dx = line1->xMin - line0->xMax;
+
+ // check inter-word spacing
+ if (dx < fontSize0 * lineMinDeltaX) {
+ return gFalse;
+ }
+
+ // look for close baselines and close font sizes
+ if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 &&
+ fontSize0 < lineMaxFontSizeRatio * fontSize1 &&
+ fontSize1 < lineMaxFontSizeRatio * fontSize0) {
+ return gTrue;
+ }
+
+ return gFalse;
+}
+
+// Returns true if <line> can be added to <blk>. Assumes the y
+// coordinates are within range.
+GBool TextPage::blockFit(TextBlock *blk, TextLine *line) {
+ double fontSize0, fontSize1;
+
+ // check edges
+ if (line->xMin < blk->xSpaceL ||
+ line->xMax > blk->xSpaceR ||
+ blk->xMin < line->xSpaceL ||
+ blk->xMax > line->xSpaceR) {
+ return gFalse;
+ }
+
+ // check font sizes
+ fontSize0 = blk->lines->fontSize;
+ fontSize1 = line->fontSize;
+ if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
+ fontSize1 > blkMaxFontSizeRatio * fontSize0) {
+ return gFalse;
+ }
+
+ return gTrue;
+}
+
+// Returns true if <blk0> and <blk1> can be merged into a single
+// block. Assumes the y coordinates are within range.
+GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) {
+ double fontSize0, fontSize1;
+
+ // check edges
+ if (blk1->xMin < blk0->xSpaceL ||
+ blk1->xMax > blk0->xSpaceR ||
+ blk0->xMin < blk1->xSpaceL ||
+ blk0->xMax > blk1->xSpaceR) {
+ return gFalse;
+ }
+
+ // check font sizes
+ fontSize0 = blk0->lines->fontSize;
+ fontSize1 = blk1->lines->fontSize;
+ if (fontSize0 > blkMaxFontSizeRatio * fontSize1 ||
+ fontSize1 > blkMaxFontSizeRatio * fontSize0) {
+ return gFalse;
+ }
+
+ return gTrue;
+}
+
+// Returns true if <blk> can be added to <flow>.
+GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) {
+ double dy;
+
+ // check whitespace above and below
+ if (blk->yMin < flow->ySpaceT ||
+ blk->yMax > flow->ySpaceB ||
+ flow->yMin < blk->ySpaceT ||
+ flow->yMax > blk->ySpaceB) {
+ return gFalse;
+ }
+
+ // check that block top edge is within +/- dy of flow top edge,
+ // and that block bottom edge is above flow bottom edge + dy
+ dy = flowMaxDeltaY * flow->blocks->maxFontSize;
+ return blk->yMin > flow->yMin - dy &&
+ blk->yMin < flow->yMin + dy &&
+ blk->yMax < flow->yMax + dy;
+}
+
GBool TextPage::findText(Unicode *s, int len,
GBool top, GBool bottom,
double *xMin, double *yMin,
double *xMax, double *yMax) {
TextLine *line;
- TextBlock *blk;
Unicode *p;
Unicode u1, u2;
int m, i, j;
double x0, x1, x;
- // scan all blocks on page
- for (line = lines; line; line = line->next) {
- for (blk = line->blocks; blk; blk = blk->next) {
+ // scan all text on the page
+ for (line = lines; line; line = line->pageNext) {
- // check: above top limit?
- if (!top && (blk->yMax < *yMin ||
- (blk->yMin < *yMin && blk->xMax <= *xMin))) {
- continue;
- }
+ // check: above top limit?
+ if (!top && (line->yMax < *yMin ||
+ (line->yMin < *yMin && line->xMax <= *xMin))) {
+ continue;
+ }
- // check: below bottom limit?
- if (!bottom && (blk->yMin > *yMax ||
- (blk->yMax > *yMax && blk->xMin >= *xMax))) {
- return gFalse;
- }
+ // check: below bottom limit?
+ if (!bottom && (line->yMin > *yMax ||
+ (line->yMax > *yMax && line->xMin >= *xMax))) {
+ return gFalse;
+ }
- // search each position in this block
- m = blk->len;
- for (i = 0, p = blk->text; i <= m - len; ++i, ++p) {
+ // search each position in this line
+ m = line->len;
+ for (i = 0, p = line->text; i <= m - len; ++i, ++p) {
- x0 = (i == 0) ? blk->xMin : blk->xRight[i-1];
- x1 = blk->xRight[i];
- x = 0.5 * (x0 + x1);
+ x0 = (i == 0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
+ x = 0.5 * (x0 + x1);
- // check: above top limit?
- if (!top && blk->yMin < *yMin) {
- if (x < *xMin) {
- continue;
- }
+ // check: above top limit?
+ if (!top && line->yMin < *yMin) {
+ if (x < *xMin) {
+ continue;
}
+ }
- // check: below bottom limit?
- if (!bottom && blk->yMax > *yMax) {
- if (x > *xMax) {
- return gFalse;
- }
+ // check: below bottom limit?
+ if (!bottom && line->yMax > *yMax) {
+ if (x > *xMax) {
+ return gFalse;
}
+ }
- // compare the strings
- for (j = 0; j < len; ++j) {
+ // compare the strings
+ for (j = 0; j < len; ++j) {
#if 1 //~ this lowercases Latin A-Z only -- this will eventually be
- //~ extended to handle other character sets
- if (p[j] >= 0x41 && p[j] <= 0x5a) {
- u1 = p[j] + 0x20;
- } else {
- u1 = p[j];
- }
- if (s[j] >= 0x41 && s[j] <= 0x5a) {
- u2 = s[j] + 0x20;
- } else {
- u2 = s[j];
- }
+ //~ extended to handle other character sets
+ if (p[j] >= 0x41 && p[j] <= 0x5a) {
+ u1 = p[j] + 0x20;
+ } else {
+ u1 = p[j];
+ }
+ if (s[j] >= 0x41 && s[j] <= 0x5a) {
+ u2 = s[j] + 0x20;
+ } else {
+ u2 = s[j];
+ }
#endif
- if (u1 != u2) {
- break;
- }
+ if (u1 != u2) {
+ break;
}
+ }
- // found it
- if (j == len) {
- *xMin = x0;
- *xMax = blk->xRight[i + len - 1];
- *yMin = blk->yMin;
- *yMax = blk->yMax;
- return gTrue;
- }
+ // found it
+ if (j == len) {
+ *xMin = x0;
+ *xMax = line->xRight[i + len - 1];
+ *yMin = line->yMin;
+ *yMax = line->yMax;
+ return gTrue;
}
}
}
@@ -870,8 +1692,7 @@ GString *TextPage::getText(double xMin, double yMin,
GBool isUnicode;
char space[8], eol[16], buf[8];
int spaceLen, eolLen, len;
- TextLine *line;
- TextBlock *blk;
+ TextLine *line, *prevLine;
double x0, x1, y;
int firstCol, col, i;
GBool multiLine;
@@ -899,40 +1720,32 @@ GString *TextPage::getText(double xMin, double yMin,
}
// find the leftmost column
- multiLine = gFalse;
firstCol = -1;
- for (line = lines; line; line = line->next) {
+ for (line = lines; line; line = line->pageNext) {
if (line->yMin > yMax) {
break;
}
- if (line->yMax < yMin) {
- continue;
- }
-
- for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
- if (!blk || blk->xMin > xMax) {
+ if (line->yMax < yMin ||
+ line->xMax < xMin ||
+ line->xMin > xMax) {
continue;
}
- y = 0.5 * (blk->yMin + blk->yMax);
+ y = 0.5 * (line->yMin + line->yMax);
if (y < yMin || y > yMax) {
continue;
}
- if (firstCol >= 0) {
- multiLine = gTrue;
- }
-
i = 0;
while (1) {
- x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
- x1 = blk->xRight[i];
+ x0 = (i==0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
if (0.5 * (x0 + x1) > xMin) {
break;
}
++i;
}
- col = blk->col[i];
+ col = line->col[i];
if (firstCol < 0 || col < firstCol) {
firstCol = col;
@@ -940,83 +1753,82 @@ GString *TextPage::getText(double xMin, double yMin,
}
// extract the text
- for (line = lines; line; line = line->next) {
+ col = firstCol;
+ multiLine = gFalse;
+ for (prevLine = NULL, line = lines;
+ line;
+ prevLine = line, line = line->pageNext) {
if (line->yMin > yMax) {
break;
}
- if (line->yMax < yMin) {
- continue;
- }
-
- for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ;
- if (!blk || blk->xMin > xMax) {
+ if (line->yMax < yMin ||
+ line->xMax < xMin ||
+ line->xMin > xMax) {
continue;
}
- y = 0.5 * (blk->yMin + blk->yMax);
+ y = 0.5 * (line->yMin + line->yMax);
if (y < yMin || y > yMax) {
continue;
}
i = 0;
while (1) {
- x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
- x1 = blk->xRight[i];
+ x0 = (i==0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
if (0.5 * (x0 + x1) > xMin) {
break;
}
++i;
}
- col = firstCol;
-
- do {
-
- // line this block up with the correct column
- for (; col < blk->col[i]; ++col) {
- s->append(space, spaceLen);
- }
+ // insert a return
+ if (col > line->col[i] ||
+ (prevLine &&
+ line->yMin >
+ prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) {
+ s->append(eol, eolLen);
+ col = firstCol;
+ multiLine = gTrue;
+ }
- // print the block
- for (; i < blk->len; ++i) {
+ // align this line with the correct column
+ for (; col < line->col[i]; ++col) {
+ s->append(space, spaceLen);
+ }
- x0 = (i==0) ? blk->xMin : blk->xRight[i-1];
- x1 = blk->xRight[i];
- if (0.5 * (x0 + x1) > xMax) {
- break;
- }
+ // print the portion of the line
+ for (; i < line->len; ++i) {
- len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
- s->append(buf, len);
- col += isUnicode ? 1 : len;
- }
- if (i < blk->len) {
+ x0 = (i==0) ? line->xMin : line->xRight[i-1];
+ x1 = line->xRight[i];
+ if (0.5 * (x0 + x1) > xMax) {
break;
}
- // next block
- blk = blk->next;
- i = 0;
-
- } while (blk && blk->xMin < xMax);
-
- if (multiLine) {
- s->append(eol, eolLen);
+ len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+ s->append(buf, len);
+ col += isUnicode ? 1 : len;
}
}
+ if (multiLine) {
+ s->append(eol, eolLen);
+ }
+
uMap->decRefCnt();
return s;
}
-void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
+void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
+ GBool physLayout) {
UnicodeMap *uMap;
char space[8], eol[16], eop[8], buf[8];
int spaceLen, eolLen, eopLen, len;
+ TextFlow *flow;
TextLine *line;
- TextBlock *blk;
- int col, d, i;
+ int col, d, n, i;
// get the output encoding
if (!(uMap = globalParams->getTextEncoding())) {
@@ -1038,142 +1850,121 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) {
}
eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
- // output
- for (line = lines; line; line = line->next) {
+ // output the page, maintaining the original physical layout
+ if (physLayout || rawOrder) {
col = 0;
- for (blk = line->blocks; blk; blk = blk->next) {
+ for (line = lines; line; line = line->pageNext) {
// line this block up with the correct column
- if (rawOrder && col == 0) {
- col = blk->col[0];
- } else {
- for (; col < blk->col[0]; ++col) {
+ if (!rawOrder) {
+ for (; col < line->col[0]; ++col) {
(*outputFunc)(outputStream, space, spaceLen);
}
}
- // print the block
- for (i = 0; i < blk->len; ++i) {
- len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf));
+ // print the line
+ for (i = 0; i < line->len; ++i) {
+ len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
(*outputFunc)(outputStream, buf, len);
}
- col += blk->convertedLen;
- }
+ col += line->convertedLen;
+
+ // print one or more returns if necessary
+ if (!line->pageNext ||
+ line->pageNext->col[0] < col ||
+ line->pageNext->yMin >
+ line->yMax - lineOverlapSlack * line->fontSize) {
+
+ // compute number of returns
+ d = 1;
+ if (line->pageNext) {
+ d += (int)((line->pageNext->yMin - line->yMax) /
+ line->fontSize + 0.5);
+ }
+
+ // various things (weird font matrices) can result in bogus
+ // values here, so do a sanity check
+ if (d < 1) {
+ d = 1;
+ } else if (d > 5) {
+ d = 5;
+ }
+ for (; d > 0; --d) {
+ (*outputFunc)(outputStream, eol, eolLen);
+ }
- // print a return
- (*outputFunc)(outputStream, eol, eolLen);
-
- // print extra vertical space if necessary
- if (line->next) {
- d = (int)((line->next->yMin - line->yMax) /
- (line->blocks->strings->yMax - lines->blocks->strings->yMin)
- + 0.5);
- // various things (weird font matrices) can result in bogus
- // values here, so do a sanity check
- if (rawOrder && d > 2) {
- d = 2;
- } else if (!rawOrder && d > 5) {
- d = 5;
+ col = 0;
}
- for (; d > 0; --d) {
- (*outputFunc)(outputStream, eol, eolLen);
+ }
+
+ // output the page, "undoing" the layout
+ } else {
+ for (flow = flows; flow; flow = flow->next) {
+ for (line = flow->lines; line; line = line->flowNext) {
+ n = line->len;
+ if (line->flowNext && line->hyphenated) {
+ --n;
+ }
+ for (i = 0; i < n; ++i) {
+ len = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
+ (*outputFunc)(outputStream, buf, len);
+ }
+ if (line->flowNext && !line->hyphenated) {
+ (*outputFunc)(outputStream, space, spaceLen);
+ }
}
+ (*outputFunc)(outputStream, eol, eolLen);
+ (*outputFunc)(outputStream, eol, eolLen);
}
}
// end of page
- (*outputFunc)(outputStream, eol, eolLen);
(*outputFunc)(outputStream, eop, eopLen);
(*outputFunc)(outputStream, eol, eolLen);
uMap->decRefCnt();
}
-// Returns true if <str1> should be inserted before <str2> in xy
-// order.
-GBool TextPage::xyBefore(TextString *str1, TextString *str2) {
- return str1->xMin < str2->xMin ||
- (str1->xMin == str2->xMin && str1->yMin < str2->yMin);
-}
-
-// Returns true if <blk1> should be inserted before <blk2> in xy
-// order.
-GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) {
- return blk1->xMin < blk2->xMin ||
- (blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin);
-}
-
-// Returns true if <blk1> should be inserted before <blk2> in yx
-// order, allowing a little slack for vertically overlapping text.
-GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) {
- double h1, h2, overlap;
-
- h1 = blk1->yMax - blk1->yMin;
- h2 = blk2->yMax - blk2->yMin;
- overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) -
- (blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) /
- (h1 < h2 ? h1 : h2);
- if (overlap > 0.6) {
- return blk1->xMin < blk2->xMin;
- }
- return blk1->yMin < blk2->yMin;
-}
-
-double TextPage::coalesceFit(TextString *str1, TextString *str2) {
- double h1, h2, w1, w2, r, overlap, spacing;
-
- h1 = str1->yMax - str1->yMin;
- h2 = str2->yMax - str2->yMin;
- w1 = str1->xMax - str1->xMin;
- w2 = str2->xMax - str2->xMin;
- r = h1 / h2;
- if (r < (1.0 / 3.0) || r > 3) {
- return 10;
- }
- overlap = ((str1->yMax < str2->yMax ? str1->yMax : str2->yMax) -
- (str1->yMin > str2->yMin ? str1->yMin : str2->yMin)) /
- (h1 < h2 ? h1 : h2);
- if (overlap < 0.5) {
- return 10;
- }
- spacing = (str2->xMin - str1->xMax) / (h1 > h2 ? h1 : h2);
- if (spacing < -0.5) {
- return 10;
- }
- // separate text that overlaps - duplicated text (so that fake
- // boldface and shadowed text can be cleanly removed)
- if ((str2->xMin - str1->xMax) / (w1 < w2 ? w1 : w2) < -0.7) {
- return 10;
- }
- return spacing;
+void TextPage::startPage(GfxState *state) {
+ clear();
+ pageWidth = state->getPageWidth();
+ pageHeight = state->getPageHeight();
}
void TextPage::clear() {
- TextLine *p1, *p2;
- TextString *s1, *s2;
+ TextWord *w1, *w2;
+ TextFlow *f1, *f2;
- if (curStr) {
- delete curStr;
- curStr = NULL;
+ if (curWord) {
+ delete curWord;
+ curWord = NULL;
}
- if (lines) {
- for (p1 = lines; p1; p1 = p2) {
- p2 = p1->next;
- delete p1;
+ if (words) {
+ for (w1 = words; w1; w1 = w2) {
+ w2 = w1->next;
+ delete w1;
}
- } else if (xyStrings) {
- for (s1 = xyStrings; s1; s1 = s2) {
- s2 = s1->next;
- delete s1;
+ } else if (flows) {
+ for (f1 = flows; f1; f1 = f2) {
+ f2 = f1->next;
+ delete f1;
}
}
- xyStrings = NULL;
- xyCur1 = xyCur2 = NULL;
- lines = NULL;
+ deleteGList(fonts, TextFontInfo);
+
+ curWord = NULL;
+ font = NULL;
+ fontSize = 0;
nest = 0;
nTinyChars = 0;
+ words = wordPtr = NULL;
+ lines = NULL;
+ flows = NULL;
+ fonts = new GList();
+
}
+
//------------------------------------------------------------------------
// TextOutputDev
//------------------------------------------------------------------------
@@ -1182,8 +1973,10 @@ static void outputToFile(void *stream, char *text, int len) {
fwrite(text, 1, len, (FILE *)stream);
}
-TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
+TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
+ GBool rawOrderA, GBool append) {
text = NULL;
+ physLayout = physLayoutA;
rawOrder = rawOrderA;
ok = gTrue;
@@ -1205,16 +1998,17 @@ TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) {
}
// set up text object
- text = new TextPage(rawOrder);
+ text = new TextPage(rawOrderA);
}
TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
- GBool rawOrderA) {
+ GBool physLayoutA, GBool rawOrderA) {
outputFunc = func;
outputStream = stream;
needClose = gFalse;
+ physLayout = physLayoutA;
rawOrder = rawOrderA;
- text = new TextPage(rawOrder);
+ text = new TextPage(rawOrderA);
ok = gTrue;
}
@@ -1231,13 +2025,13 @@ TextOutputDev::~TextOutputDev() {
}
void TextOutputDev::startPage(int pageNum, GfxState *state) {
- text->clear();
+ text->startPage(state);
}
void TextOutputDev::endPage() {
text->coalesce();
if (outputStream) {
- text->dump(outputStream, outputFunc);
+ text->dump(outputStream, outputFunc, physLayout);
}
}
@@ -1246,18 +2040,18 @@ void TextOutputDev::updateFont(GfxState *state) {
}
void TextOutputDev::beginString(GfxState *state, GString *s) {
- text->beginString(state, state->getCurX(), state->getCurY());
+ text->beginWord(state, state->getCurX(), state->getCurY());
}
void TextOutputDev::endString(GfxState *state) {
- text->endString();
+ text->endWord();
}
void TextOutputDev::drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode c, Unicode *u, int uLen) {
- text->addChar(state, x, y, dx, dy, u, uLen);
+ text->addChar(state, x, y, dx, dy, c, u, uLen);
}
GBool TextOutputDev::findText(Unicode *s, int len,
@@ -1272,3 +2066,5 @@ GString *TextOutputDev::getText(double xMin, double yMin,
return text->getText(xMin, yMin, xMax, yMax);
}
+
+
diff --git a/pdf/xpdf/TextOutputDev.h b/pdf/xpdf/TextOutputDev.h
index f681ecf..8e94f04 100644
--- a/pdf/xpdf/TextOutputDev.h
+++ b/pdf/xpdf/TextOutputDev.h
@@ -20,12 +20,10 @@
#include "GfxFont.h"
#include "OutputDev.h"
-class GfxState;
class GString;
-class TextBlock;
-class TextLine;
-
-#undef TEXTOUT_DO_SYMBOLS
+class GList;
+class GfxFont;
+class GfxState;
//------------------------------------------------------------------------
@@ -33,43 +31,165 @@ typedef void (*TextOutputFunc)(void *stream, char *text, int len);
//------------------------------------------------------------------------
-// TextString
+// TextFontInfo
//------------------------------------------------------------------------
-class TextString {
+class TextFontInfo {
+public:
+
+ TextFontInfo(GfxState *state);
+ ~TextFontInfo();
+
+ GBool matches(GfxState *state);
+
+private:
+
+ GfxFont *gfxFont;
+ double horizScaling;
+
+ double minSpaceWidth; // min width for inter-word space, as a
+ // fraction of the font size
+ double maxSpaceWidth; // max width for inter-word space, as a
+ // fraction of the font size
+
+
+ friend class TextWord;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+class TextWord {
public:
// Constructor.
- TextString(GfxState *state, double x0, double y0,
- double fontSize);
+ TextWord(GfxState *state, double x0, double y0,
+ TextFontInfo *fontA, double fontSize);
// Destructor.
- ~TextString();
+ ~TextWord();
- // Add a character to the string.
+ // Add a character to the word.
void addChar(GfxState *state, double x, double y,
double dx, double dy, Unicode u);
+
private:
+ GBool xyBefore(TextWord *word2);
+ void merge(TextWord *word2);
+
double xMin, xMax; // bounding box x coordinates
double yMin, yMax; // bounding box y coordinates
- union {
- GBool marked; // temporary flag used by coalesce()
- GBool spaceAfter; // insert a space after this string?
- };
+ double yBase; // baseline y coordinate
Unicode *text; // the text
double *xRight; // right-hand x coord of each char
int len; // length of text and xRight
int size; // size of text and xRight arrays
- TextString *next;
+ TextFontInfo *font; // font information
+ double fontSize; // font size
+ GBool spaceAfter; // set if there is a space between this
+ // word and the next word on the line
+ TextWord *next; // next word in line (before lines are
+ // assembled: next word in xy order)
+
+ friend class TextLine;
friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+class TextLine {
+public:
+
+ TextLine();
+ ~TextLine();
+
+private:
+
+ GBool yxBefore(TextLine *line2);
+ void merge(TextLine *line2);
+
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double yBase; // primary baseline y coordinate
+ double xSpaceL, xSpaceR; // whitespace to left and right of this line
+ TextFontInfo *font; // primary font
+ double fontSize; // primary font size
+ TextWord *words; // words in this line
+ Unicode *text; // Unicode text of the line, including
+ // spaces between words
+ double *xRight; // right-hand x coord of each Unicode char
+ int *col; // starting column number of each Unicode char
+ int len; // number of Unicode chars
+ int convertedLen; // total number of converted characters
+ GBool hyphenated; // set if last char is a hyphen
+ TextLine *pageNext; // next line on page
+ TextLine *next; // next line in block
+ TextLine *flowNext; // next line in flow
+
friend class TextBlock;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+class TextBlock {
+public:
+
+ TextBlock();
+ ~TextBlock();
+
+private:
+
+ GBool yxBefore(TextBlock *blk2);
+ void mergeRight(TextBlock *blk2);
+ void mergeBelow(TextBlock *blk2);
+
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double xSpaceL, xSpaceR; // whitespace to left and right of this block
+ double ySpaceT, ySpaceB; // whitespace above and below this block
+ double maxFontSize; // max primary font size
+ TextLine *lines; // lines in block
+ TextBlock *next; // next block in flow
+ TextBlock *stackNext; // next block on traversal stack
+
+ friend class TextFlow;
+ friend class TextPage;
};
//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+class TextFlow {
+public:
+
+ TextFlow();
+ ~TextFlow();
+
+private:
+
+ double yMin, yMax; // bounding box y coordinates
+ double ySpaceT, ySpaceB; // whitespace above and below this flow
+ TextBlock *blocks; // blocks in flow
+ TextLine *lines; // lines in flow
+ TextFlow *next; // next flow on page
+
+ friend class TextPage;
+};
+
+
+//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
@@ -77,7 +197,7 @@ class TextPage {
public:
// Constructor.
- TextPage(GBool rawOrderA);
+ TextPage(GBool rawOrder);
// Destructor.
~TextPage();
@@ -86,18 +206,19 @@ public:
void updateFont(GfxState *state);
- // Begin a new string.
- void beginString(GfxState *state, double x0, double y0);
+ // Begin a new word.
+ void beginWord(GfxState *state, double x0, double y0);
- // Add a character to the current string.
+ // Add a character to the current word.
void addChar(GfxState *state, double x, double y,
- double dx, double dy, Unicode *u, int uLen);
+ double dx, double dy,
+ CharCode c, Unicode *u, int uLen);
- // End the current string, sorting it into the list of strings.
- void endString();
+ // End the current word, sorting it into the list of words.
+ void endWord();
- // Add a string, sorting it into the list of strings.
- void addString(TextString *str);
+ // Add a word, sorting it into the list of words.
+ void addWord(TextWord *word);
// Coalesce strings that look like parts of the same line.
@@ -118,31 +239,41 @@ public:
double xMax, double yMax);
// Dump contents of page to a file.
- void dump(void *outputStream, TextOutputFunc outputFunc);
+ void dump(void *outputStream, TextOutputFunc outputFunc,
+ GBool physLayout);
+
+ // Start a new page.
+ void startPage(GfxState *state);
- // Clear the page.
- void clear();
private:
- GBool xyBefore(TextString *str1, TextString *str2);
- GBool xyBefore(TextBlock *blk1, TextBlock *blk2);
- GBool yxBefore(TextBlock *blk1, TextBlock *blk2);
- double coalesceFit(TextString *str1, TextString *str2);
+ void clear();
+ double lineFit(TextLine *line, TextWord *lastWord, TextWord *word);
+ GBool lineFit2(TextLine *line0, TextLine *line1);
+ GBool blockFit(TextBlock *blk, TextLine *line);
+ GBool blockFit2(TextBlock *blk0, TextBlock *blk1);
+ GBool flowFit(TextFlow *flow, TextBlock *blk);
- GBool rawOrder; // keep strings in content stream order
+ GBool rawOrder; // keep text in content stream order
- TextString *curStr; // currently active string
+ double pageWidth, pageHeight; // width and height of current page
+ TextWord *curWord; // currently active word
+ TextFontInfo *font; // current font
double fontSize; // current font size
+ int nest; // current nesting level (for Type 3 fonts)
+ int nTinyChars; // number of "tiny" chars seen so far
- TextString *xyStrings; // strings in x-major order (before
- // they're sorted into lines)
- TextString *xyCur1, *xyCur2; // cursors for xyStrings list
- TextLine *lines; // list of lines
+ TextWord *words; // words, in xy order (before they're
+ // sorted into lines)
+ TextWord *wordPtr; // cursor for the word list
- int nest; // current nesting level (for Type 3 fonts)
+ TextLine *lines; // lines, in xy order
+ TextFlow *flows; // flows, in reading order
+
+ GList *fonts; // all font info objects used on this
+ // page [TextFontInfo]
- int nTinyChars; // number of "tiny" chars seen so far
};
@@ -155,12 +286,18 @@ public:
// Open a text output file. If <fileName> is NULL, no file is
// written (this is useful, e.g., for searching text). If
- // <rawOrder> is true, the text is kept in content stream order.
- TextOutputDev(char *fileName, GBool rawOrderA, GBool append);
+ // <physLayoutA> is true, the original physical layout of the text
+ // is maintained. If <rawOrderA> is true, the text is kept in
+ // content stream order.
+ TextOutputDev(char *fileName, GBool physLayoutA,
+ GBool rawOrderA, GBool append);
// Create a TextOutputDev which will write to a generic stream. If
- // <rawOrder> is true, the text is kept in content stream order.
- TextOutputDev(TextOutputFunc func, void *stream, GBool rawOrderA);
+ // <physLayoutA> is true, the original physical layout of the text
+ // is maintained. If <rawOrderA> is true, the text is kept in
+ // content stream order.
+ TextOutputDev(TextOutputFunc func, void *stream,
+ GBool physLayoutA, GBool rawOrderA);
// Destructor.
virtual ~TextOutputDev();
@@ -221,6 +358,7 @@ public:
GString *getText(double xMin, double yMin,
double xMax, double yMax);
+
private:
TextOutputFunc outputFunc; // output function
@@ -228,6 +366,8 @@ private:
GBool needClose; // need to close the output file?
// (only if outputStream is a FILE*)
TextPage *text; // text for the current page
+ GBool physLayout; // maintain original physical layout when
+ // dumping text
GBool rawOrder; // keep text in content stream order
GBool ok; // set up ok?
diff --git a/pdf/xpdf/XOutputDev.cc b/pdf/xpdf/XOutputDev.cc
index 3c58f56..f9064fa 100644
--- a/pdf/xpdf/XOutputDev.cc
+++ b/pdf/xpdf/XOutputDev.cc
@@ -828,6 +828,7 @@ XOutputFont *XOutputFontCache::getFont(XRef *xref, GfxFont *gfxFont,
}
#endif
break;
+ case fontCIDType0:
case fontCIDType0C:
#if FREETYPE2 && (HAVE_FREETYPE_FREETYPE_H || HAVE_FREETYPE_H)
if (freetypeControl != fontRastNone) {
@@ -993,7 +994,7 @@ XOutputFont *XOutputFontCache::tryGetFont(XRef *xref, DisplayFontParam *dfp,
case displayFontT1:
#if HAVE_T1LIB_H
- if (t1libControl != fontRastNone) {
+ if (t1libControl != fontRastNone && !gfxFont->isCIDFont()) {
font = tryGetT1FontFromFile(xref, dfp->t1.fileName, gFalse, gfxFont,
m11Orig, m12Orig, m21Orig, m22Orig,
m11, m12, m21, m22, subst);
@@ -1304,7 +1305,7 @@ XOutputFont *XOutputFontCache::tryGetFTFontFromFile(XRef *xref,
fontFile = new FTFontFile(ftEngine, fileName->getCString(),
((GfxCIDFont *)gfxFont)->getCIDToGID(),
((GfxCIDFont *)gfxFont)->getCIDToGIDLen());
- } else { // fontCIDType0C
+ } else { // fontCIDType0, fontCIDType0C
fontFile = new FTFontFile(ftEngine, fileName->getCString());
}
} else {
@@ -1843,7 +1844,7 @@ void XOutputDev::startPage(int pageNum, GfxState *state) {
XFillRectangle(display, pixmap, paperGC, 0, 0, pixmapW, pixmapH);
// clear text object
- text->clear();
+ text->startPage(state);
}
void XOutputDev::endPage() {
@@ -1942,6 +1943,9 @@ void XOutputDev::restoreState(GfxState *state) {
s = save;
save = save->next;
delete s;
+
+ // restore the font
+ updateFont(state);
}
}
@@ -2483,11 +2487,11 @@ void XOutputDev::addPoint(XPoint **points, int *size, int *k, int x, int y) {
}
void XOutputDev::beginString(GfxState *state, GString *s) {
- text->beginString(state, state->getCurX(), state->getCurY());
+ text->beginWord(state, state->getCurX(), state->getCurY());
}
void XOutputDev::endString(GfxState *state) {
- text->endString();
+ text->endWord();
}
void XOutputDev::drawChar(GfxState *state, double x, double y,
@@ -2501,7 +2505,7 @@ void XOutputDev::drawChar(GfxState *state, double x, double y,
double *ctm;
double saveCTM[6];
- text->addChar(state, x, y, dx, dy, u, uLen);
+ text->addChar(state, x, y, dx, dy, code, u, uLen);
if (!font) {
return;
@@ -2676,7 +2680,7 @@ GBool XOutputDev::beginType3Char(GfxState *state,
}
text->addChar(state, 0, 0,
t3Font->cacheTags[i+j].wx, t3Font->cacheTags[i+j].wy,
- u, uLen);
+ code, u, uLen);
drawType3Glyph(t3Font, &t3Font->cacheTags[i+j],
t3Font->cacheData + (i+j) * t3Font->glyphSize,
xt, yt, &color);
@@ -2755,7 +2759,7 @@ void XOutputDev::endType3Char(GfxState *state) {
t3GlyphStack->origCTM4, t3GlyphStack->origCTM5);
}
text->addChar(state, 0, 0, t3GlyphStack->wx, t3GlyphStack->wy,
- t3GlyphStack->u, t3GlyphStack->uLen);
+ t3GlyphStack->code, t3GlyphStack->u, t3GlyphStack->uLen);
t3gs = t3GlyphStack;
t3GlyphStack = t3gs->next;
delete t3gs;
@@ -2850,11 +2854,61 @@ void XOutputDev::type3D1(GfxState *state, double wx, double wy,
XRectangle rect;
double *ctm;
T3FontCache *t3Font;
+ double xt, yt, xMin, xMax, yMin, yMax, x1, y1;
int i, j;
+ t3Font = t3GlyphStack->cache;
+ t3GlyphStack->wx = wx;
+ t3GlyphStack->wy = wy;
+
+ // check for a valid bbox
+ state->transform(0, 0, &xt, &yt);
+ state->transform(llx, lly, &x1, &y1);
+ xMin = xMax = x1;
+ yMin = yMax = y1;
+ state->transform(llx, ury, &x1, &y1);
+ if (x1 < xMin) {
+ xMin = x1;
+ } else if (x1 > xMax) {
+ xMax = x1;
+ }
+ if (y1 < yMin) {
+ yMin = y1;
+ } else if (y1 > yMax) {
+ yMax = y1;
+ }
+ state->transform(urx, lly, &x1, &y1);
+ if (x1 < xMin) {
+ xMin = x1;
+ } else if (x1 > xMax) {
+ xMax = x1;
+ }
+ if (y1 < yMin) {
+ yMin = y1;
+ } else if (y1 > yMax) {
+ yMax = y1;
+ }
+ state->transform(urx, ury, &x1, &y1);
+ if (x1 < xMin) {
+ xMin = x1;
+ } else if (x1 > xMax) {
+ xMax = x1;
+ }
+ if (y1 < yMin) {
+ yMin = y1;
+ } else if (y1 > yMax) {
+ yMax = y1;
+ }
+ if (xMin - xt < t3Font->glyphX ||
+ yMin - yt < t3Font->glyphY ||
+ xMax - xt > t3Font->glyphX + t3Font->glyphW ||
+ yMax - yt > t3Font->glyphY + t3Font->glyphH) {
+ error(-1, "Bad bounding box in Type 3 glyph");
+ return;
+ }
+
// allocate a cache entry
t3GlyphStack->cacheable = gTrue;
- t3Font = t3GlyphStack->cache;
i = t3GlyphStack->cacheIdx;
for (j = 0; j < t3Font->cacheAssoc; ++j) {
if ((t3Font->cacheTags[i+j].mru & 0x7fff) == t3Font->cacheAssoc - 1) {
@@ -2866,8 +2920,6 @@ void XOutputDev::type3D1(GfxState *state, double wx, double wy,
++t3Font->cacheTags[i+j].mru;
}
}
- t3GlyphStack->wx = wx;
- t3GlyphStack->wy = wy;
t3GlyphStack->cacheTag->wx = wx;
t3GlyphStack->cacheTag->wy = wy;
diff --git a/pdf/xpdf/pdftotext.cc b/pdf/xpdf/pdftotext.cc
index 150954f..8b13ff2 100644
--- a/pdf/xpdf/pdftotext.cc
+++ b/pdf/xpdf/pdftotext.cc
@@ -35,6 +35,7 @@ static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);
static int firstPage = 1;
static int lastPage = 0;
+static GBool physLayout = gFalse;
static GBool rawOrder = gFalse;
static GBool htmlMeta = gFalse;
static char textEncName[128] = "";
@@ -51,6 +52,8 @@ static ArgDesc argDesc[] = {
"first page to convert"},
{"-l", argInt, &lastPage, 0,
"last page to convert"},
+ {"-layout", argFlag, &physLayout, 0,
+ "maintain original physical layout"},
{"-raw", argFlag, &rawOrder, 0,
"keep strings in content stream order"},
{"-htmlmeta", argFlag, &htmlMeta, 0,
@@ -222,7 +225,8 @@ int main(int argc, char *argv[]) {
}
// write text file
- textOut = new TextOutputDev(textFileName->getCString(), rawOrder, htmlMeta);
+ textOut = new TextOutputDev(textFileName->getCString(),
+ physLayout, rawOrder, htmlMeta);
if (textOut->isOk()) {
doc->displayPages(textOut, firstPage, lastPage, 72, 0, gFalse);
} else {
diff --git a/pdf/xpdf/xpdf.cc b/pdf/xpdf/xpdf.cc
index ef47fb6..290cfe9 100644
--- a/pdf/xpdf/xpdf.cc
+++ b/pdf/xpdf/xpdf.cc
@@ -34,6 +34,7 @@ static char ownerPassword[33] = "";
static char userPassword[33] = "";
static GBool fullScreen = gFalse;
static char remoteName[100] = "xpdf_";
+static GBool doRemoteReload = gFalse;
static GBool doRemoteRaise = gFalse;
static GBool doRemoteQuit = gFalse;
static GBool printCommands = gFalse;
@@ -89,6 +90,8 @@ static ArgDesc argDesc[] = {
"run in full-screen (presentation) mode"},
{"-remote", argString, remoteName + 5, sizeof(remoteName) - 5,
"start/contact xpdf remote server with specified name"},
+ {"-reload", argFlag, &doRemoteReload, 0,
+ "reload xpdf remove server window (with -remote only)"},
{"-raise", argFlag, &doRemoteRaise, 0,
"raise xpdf remote server window (with -remote only)"},
{"-quit", argFlag, &doRemoteQuit, 0,
@@ -184,12 +187,15 @@ int main(int argc, char *argv[]) {
}
// check command line
+ ok = ok && argc >= 1 && argc <= 3;
+ if (doRemoteReload) {
+ ok = ok && remoteName[5] && !doRemoteQuit && argc == 1;
+ }
if (doRemoteRaise) {
- ok = ok && remoteName[5] && !doRemoteQuit && argc >= 1 && argc <= 3;
- } else if (doRemoteQuit) {
+ ok = ok && remoteName[5] && !doRemoteQuit;
+ }
+ if (doRemoteQuit) {
ok = ok && remoteName[5] && argc == 1;
- } else {
- ok = ok && argc >= 1 && argc <= 3;
}
if (!ok || printVersion || printHelp) {
fprintf(stderr, "xpdf version %s\n", xpdfVersion);
@@ -225,6 +231,8 @@ int main(int argc, char *argv[]) {
} else {
app->remoteOpen(fileName, pg, doRemoteRaise);
}
+ } else if (doRemoteReload) {
+ app->remoteReload(doRemoteRaise);
} else if (doRemoteRaise) {
app->remoteRaise();
} else if (doRemoteQuit) {
diff --git a/pdf/xpdf/xpdfconfig.h b/pdf/xpdf/xpdfconfig.h
index bb6eab9..ef1764a 100644
--- a/pdf/xpdf/xpdfconfig.h
+++ b/pdf/xpdf/xpdfconfig.h
@@ -14,10 +14,10 @@
//------------------------------------------------------------------------
// xpdf version
-#define xpdfVersion "2.00"
-#define xpdfVersionNum 2.00
+#define xpdfVersion "2.01"
+#define xpdfVersionNum 2.01
#define xpdfMajorVersion 2
-#define xpdfMinorVersion 0
+#define xpdfMinorVersion 1
#define xpdfMajorVersionStr "2"
// supported PDF version
@@ -28,7 +28,7 @@
#define xpdfCopyright "Copyright 1996-2002 Glyph & Cog, LLC"
// Windows resource file stuff
-#define winxpdfVersion "WinXpdf 2.00"
+#define winxpdfVersion "WinXpdf 2.01"
#define xpdfCopyrightAmp "Copyright 1996-2002 Glyph && Cog, LLC"
//------------------------------------------------------------------------