From 8032fd96d450ac015c0153f1fa57e974d67b4993 Mon Sep 17 00:00:00 2001 From: Martin Kretzschmar Date: Mon, 31 Mar 2003 23:23:17 +0000 Subject: update * ANNOUNCE, CHANGES, README, aconf-win32.h: update * xpdf/CharCodeToUnicode.cc, xpdf/Decrypt.cc, xpdf/FTFont.cc, xpdf/FTFont.h, xpdf/FontEncodingTables.cc, xpdf/Gfx.cc, xpdf/GfxFont.cc, xpdf/GfxState.cc, xpdf/GfxState.h, xpdf/GlobalParams.cc, xpdf/GlobalParams.h, xpdf/Link.cc, xpdf/NameToUnicodeTable.h, xpdf/Stream.cc, xpdf/TextOutputDev.cc, xpdf/TextOutputDev.h, xpdf/XOutputDev.cc, xpdf/config.h, xpdf/pdftotext.cc, xpdf/xpdf.cc, xpdf/Outline.cc, xpdf/XPDFApp.cc, xpdf/XPDFApp.h, xpdf/XPDFCore.cc, xpdf/XPDFCore.h, xpdf/XPDFViewer.cc, xpdf/XPDFViewer.h: update. * goo/gfile.cc: update. * goo/Makefile.am: use GMutex.h * doc/pdffonts.1, doc/pdffonts.cat, doc/pdfimages.1, doc/pdfimages.cat, doc/pdfinfo.1, doc/pdfinfo.cat, doc/pdftopbm.1, doc/pdftopbm.cat, doc/pdftops.1, doc/pdftops.cat, doc/pdftotext.1, doc/pdftotext.cat, doc/pdftotext.hlp, doc/xpdf.1, doc/xpdf.cat, doc/xpdf.hlp, doc/xpdfrc.5, doc/xpdfrc.cat, doc/xpdfrc.hlp: update --- diff --git a/pdf/goo/Makefile.am b/pdf/goo/Makefile.am index 24f89d8..b08ca56 100644 --- a/pdf/goo/Makefile.am +++ b/pdf/goo/Makefile.am @@ -5,6 +5,7 @@ libgoo_a_SOURCES = \ GHash.h \ GList.cc \ GList.h \ + GMutex.h \ GString.cc \ GString.h \ gmempp.cc \ diff --git a/pdf/goo/gfile.cc b/pdf/goo/gfile.cc index d6d2363..e6603c6 100644 --- a/pdf/goo/gfile.cc +++ b/pdf/goo/gfile.cc @@ -447,8 +447,6 @@ GBool openTempFile(GString **name, FILE **f, char *mode, char *ext) { #if defined(WIN32) //---------- Win32 ---------- char *s; - char buf[_MAX_PATH]; - char *fp; if (!(s = _tempnam(getenv("TEMP"), NULL))) { return gFalse; @@ -646,10 +644,8 @@ GDir::~GDir() { } GDirEntry *GDir::getNextEntry() { - struct dirent *ent; GDirEntry *e; - e = NULL; #if defined(WIN32) e = new GDirEntry(path->getCString(), ffd.cFileName, doStat); if (hnd && !FindNextFile(hnd, &ffd)) { @@ -658,24 +654,34 @@ GDirEntry 
*GDir::getNextEntry() { } #elif defined(ACORN) #elif defined(MACOS) -#else +#elif defined(VMS) + struct dirent *ent; + e = NULL; if (dir) { -#ifdef VMS if (needParent) { e = new GDirEntry(path->getCString(), "-", doStat); needParent = gFalse; return e; } -#endif ent = readdir(dir); -#ifndef VMS - if (ent && !strcmp(ent->d_name, ".")) + if (ent) { + e = new GDirEntry(path->getCString(), ent->d_name, doStat); + } + } +#else + struct dirent *ent; + e = NULL; + if (dir) { + ent = readdir(dir); + if (ent && !strcmp(ent->d_name, ".")) { ent = readdir(dir); -#endif - if (ent) + } + if (ent) { e = new GDirEntry(path->getCString(), ent->d_name, doStat); + } } #endif + return e; } diff --git a/pdf/xpdf/CharCodeToUnicode.cc b/pdf/xpdf/CharCodeToUnicode.cc index f61d400..e2fecbc 100644 --- a/pdf/xpdf/CharCodeToUnicode.cc +++ b/pdf/xpdf/CharCodeToUnicode.cc @@ -224,7 +224,7 @@ void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, map[i] = 0; } } - if (n3 == 6) { + if (n3 <= 6) { if (sscanf(tok3 + 1, "%x", &u) != 1) { error(-1, "Illegal entry in bfrange block in ToUnicode CMap"); continue; diff --git a/pdf/xpdf/Decrypt.cc b/pdf/xpdf/Decrypt.cc index bb3e3f1..b58a6c5 100644 --- a/pdf/xpdf/Decrypt.cc +++ b/pdf/xpdf/Decrypt.cc @@ -382,20 +382,20 @@ static void md5(Guchar *msg, int msgLen, Guchar *digest) { } // break digest into bytes - digest[0] = a & 0xff; - digest[1] = (a >>= 8) & 0xff; - digest[2] = (a >>= 8) & 0xff; - digest[3] = (a >>= 8) & 0xff; - digest[4] = b & 0xff; - digest[5] = (b >>= 8) & 0xff; - digest[6] = (b >>= 8) & 0xff; - digest[7] = (b >>= 8) & 0xff; - digest[8] = c & 0xff; - digest[9] = (c >>= 8) & 0xff; - digest[10] = (c >>= 8) & 0xff; - digest[11] = (c >>= 8) & 0xff; - digest[12] = d & 0xff; - digest[13] = (d >>= 8) & 0xff; - digest[14] = (d >>= 8) & 0xff; - digest[15] = (d >>= 8) & 0xff; + digest[0] = (Guchar)(a & 0xff); + digest[1] = (Guchar)((a >>= 8) & 0xff); + digest[2] = (Guchar)((a >>= 8) & 0xff); + digest[3] = (Guchar)((a >>= 8) & 
0xff); + digest[4] = (Guchar)(b & 0xff); + digest[5] = (Guchar)((b >>= 8) & 0xff); + digest[6] = (Guchar)((b >>= 8) & 0xff); + digest[7] = (Guchar)((b >>= 8) & 0xff); + digest[8] = (Guchar)(c & 0xff); + digest[9] = (Guchar)((c >>= 8) & 0xff); + digest[10] = (Guchar)((c >>= 8) & 0xff); + digest[11] = (Guchar)((c >>= 8) & 0xff); + digest[12] = (Guchar)(d & 0xff); + digest[13] = (Guchar)((d >>= 8) & 0xff); + digest[14] = (Guchar)((d >>= 8) & 0xff); + digest[15] = (Guchar)((d >>= 8) & 0xff); } diff --git a/pdf/xpdf/FTFont.cc b/pdf/xpdf/FTFont.cc index 8de09e0..ab101ac 100644 --- a/pdf/xpdf/FTFont.cc +++ b/pdf/xpdf/FTFont.cc @@ -56,6 +56,9 @@ FTFontFile::FTFontFile(FTFontEngine *engineA, char *fontFileName, ok = gFalse; engine = engineA; codeMap = NULL; + cidToGID = NULL; + cidToGIDLen = 0; + if (FT_New_Face(engine->lib, fontFileName, 0, &face)) { return; } @@ -144,11 +147,15 @@ FTFontFile::FTFontFile(FTFontEngine *engineA, char *fontFileName, ok = gFalse; engine = engineA; codeMap = NULL; + cidToGID = NULL; + cidToGIDLen = 0; + if (FT_New_Face(engine->lib, fontFileName, 0, &face)) { return; } - cidToGID = cidToGIDA; cidToGIDLen = cidToGIDLenA; + cidToGID = (Gushort *)gmalloc(cidToGIDLen * sizeof(Gushort)); + memcpy(cidToGID, cidToGIDA, cidToGIDLen * sizeof(Gushort)); mode = ftFontModeCIDToGIDMap; ok = gTrue; } @@ -157,12 +164,17 @@ FTFontFile::FTFontFile(FTFontEngine *engineA, char *fontFileName) { ok = gFalse; engine = engineA; codeMap = NULL; + cidToGID = NULL; + cidToGIDLen = 0; + if (FT_New_Face(engine->lib, fontFileName, 0, &face)) { return; } - cidToGID = NULL; - cidToGIDLen = 0; - mode = ftFontModeCFFCharset; + if (!strcmp(face->driver->root.clazz->module_name, "t1cid")) { + mode = ftFontModeCID; + } else { + mode = ftFontModeCFFCharset; + } ok = gTrue; } @@ -173,6 +185,9 @@ FTFontFile::~FTFontFile() { if (codeMap) { gfree(codeMap); } + if (cidToGID) { + gfree(cidToGID); + } } //------------------------------------------------------------------------ @@ -664,20 
+679,25 @@ FT_UInt FTFont::getGlyphIndex(CharCode c, Unicode u) { break; case ftFontModeCFFCharset: #if 1 //~ cff cid->gid map + { #if FREETYPE_MAJOR == 2 && FREETYPE_MINOR == 0 - CFF_Font *cff = (CFF_Font *)((TT_Face)fontFile->face)->extra.data; + CFF_Font *cff = (CFF_Font *)((TT_Face)fontFile->face)->extra.data; #else - CFF_Font cff = (CFF_Font)((TT_Face)fontFile->face)->extra.data; + CFF_Font cff = (CFF_Font)((TT_Face)fontFile->face)->extra.data; #endif - idx = 0; - for (j = 0; j < (int)cff->num_glyphs; ++j) { - if (cff->charset.sids[j] == c) { - idx = j; - break; + idx = 0; + for (j = 0; j < (int)cff->num_glyphs; ++j) { + if (cff->charset.sids[j] == c) { + idx = j; + break; + } } } #endif break; + case ftFontModeCID: + idx = c; + break; } return idx; } diff --git a/pdf/xpdf/FTFont.h b/pdf/xpdf/FTFont.h index 02c257a..32675c6 100644 --- a/pdf/xpdf/FTFont.h +++ b/pdf/xpdf/FTFont.h @@ -53,7 +53,8 @@ enum FTFontIndexMode { ftFontModeCodeMap, ftFontModeCodeMapDirect, ftFontModeCIDToGIDMap, - ftFontModeCFFCharset + ftFontModeCFFCharset, + ftFontModeCID }; class FTFontFile: public SFontFile { diff --git a/pdf/xpdf/Gfx.cc b/pdf/xpdf/Gfx.cc index 2717a04..21136b1 100644 --- a/pdf/xpdf/Gfx.cc +++ b/pdf/xpdf/Gfx.cc @@ -1825,7 +1825,7 @@ void Gfx::doRadialShFill(GfxRadialShading *shading) { } void Gfx::doEndPath() { - if (state->isPath() && clip != clipNone) { + if (state->isCurPt() && clip != clipNone) { state->clip(); if (clip == clipNormal) { out->clip(state); @@ -2038,7 +2038,7 @@ void Gfx::doShowText(GString *s) { double riseX, riseY; CharCode code; Unicode u[8]; - double x, y, dx, dy, dx2, dy2, curX, curY, tdx, tdy; + double x, y, dx, dy, dx2, dy2, curX, curY, tdx, tdy, lineX, lineY; double originX, originY, tOriginX, tOriginY; double oldCTM[6], newCTM[6]; double *mat; @@ -2082,6 +2082,8 @@ void Gfx::doShowText(GString *s) { state->textTransformDelta(0, state->getRise(), &riseX, &riseY); curX = state->getCurX(); curY = state->getCurY(); + lineX = state->getLineX(); + 
lineY = state->getLineY(); oldParser = parser; p = s->getCString(); len = s->getLength(); @@ -2120,10 +2122,11 @@ void Gfx::doShowText(GString *s) { state = state->restore(); out->restoreState(state); // GfxState::restore() does *not* restore the current position, - // so we track it here with (curX, curY) + // so we deal with it here using (curX, curY) and (lineX, lineY) curX += tdx; curY += tdy; state->moveTo(curX, curY); + state->textSetPos(lineX, lineY); p += n; len -= n; } diff --git a/pdf/xpdf/GfxFont.cc b/pdf/xpdf/GfxFont.cc index 5acb845..b3b6a71 100644 --- a/pdf/xpdf/GfxFont.cc +++ b/pdf/xpdf/GfxFont.cc @@ -66,6 +66,9 @@ static StdFontMapEntry stdFontMap[] = { { "Helvetica,Italic", "Helvetica-Oblique" }, { "Helvetica-BoldItalic", "Helvetica-BoldOblique" }, { "Helvetica-Italic", "Helvetica-Oblique" }, + { "Symbol,Bold", "Symbol" }, + { "Symbol,BoldItalic", "Symbol" }, + { "Symbol,Italic", "Symbol" }, { "TimesNewRoman", "Times-Roman" }, { "TimesNewRoman,Bold", "Times-Bold" }, { "TimesNewRoman,BoldItalic", "Times-BoldItalic" }, @@ -256,6 +259,10 @@ void GfxFont::readFontDescriptor(XRef *xref, Dict *fontDict) { if (t != 0) { descent = t; } + // some broken font descriptors specify a positive descent + if (descent > 0) { + descent = -descent; + } } obj2.free(); @@ -949,7 +956,7 @@ GfxCIDFont::GfxCIDFont(XRef *xref, char *tagA, Ref idA, GString *nameA, // CIDToGIDMap (for embedded TrueType fonts) if (type == fontCIDType2) { - fontDict->lookup("CIDToGIDMap", &obj1); + desFontDict->lookup("CIDToGIDMap", &obj1); if (obj1.isStream()) { cidToGIDLen = 0; i = 64; diff --git a/pdf/xpdf/GfxState.cc b/pdf/xpdf/GfxState.cc index d968ac1..a978b50 100644 --- a/pdf/xpdf/GfxState.cc +++ b/pdf/xpdf/GfxState.cc @@ -29,6 +29,24 @@ static inline double clip01(double x) { } //------------------------------------------------------------------------ + +static char *gfxColorSpaceModeNames[] = { + "DeviceGray", + "CalGray", + "DeviceRGB", + "CalRGB", + "DeviceCMYK", + "Lab", + 
"ICCBased", + "Indexed", + "Separation", + "DeviceN", + "Pattern" +}; + +#define nGfxColorSpaceModes ((sizeof(gfxColorSpaceModeNames) / sizeof(char *))) + +//------------------------------------------------------------------------ // GfxColorSpace //------------------------------------------------------------------------ @@ -99,6 +117,14 @@ void GfxColorSpace::getDefaultRanges(double *decodeLow, double *decodeRange, } } +int GfxColorSpace::getNumColorSpaceModes() { + return nGfxColorSpaceModes; +} + +char *GfxColorSpace::getColorSpaceModeName(int idx) { + return gfxColorSpaceModeNames[idx]; +} + //------------------------------------------------------------------------ // GfxDeviceGrayColorSpace //------------------------------------------------------------------------ @@ -850,9 +876,9 @@ GfxColorSpace *GfxIndexedColorSpace::parse(Array *arr) { return NULL; } -void GfxIndexedColorSpace::getGray(GfxColor *color, double *gray) { +GfxColor *GfxIndexedColorSpace::mapColorToBase(GfxColor *color, + GfxColor *baseColor) { Guchar *p; - GfxColor color2; double low[gfxColorMaxComps], range[gfxColorMaxComps]; int n, i; @@ -860,39 +886,27 @@ void GfxIndexedColorSpace::getGray(GfxColor *color, double *gray) { base->getDefaultRanges(low, range, indexHigh); p = &lookup[(int)(color->c[0] + 0.5) * n]; for (i = 0; i < n; ++i) { - color2.c[i] = low[i] + (p[i] / 255.0) * range[i]; + baseColor->c[i] = low[i] + (p[i] / 255.0) * range[i]; } - base->getGray(&color2, gray); + return baseColor; +} + +void GfxIndexedColorSpace::getGray(GfxColor *color, double *gray) { + GfxColor color2; + + base->getGray(mapColorToBase(color, &color2), gray); } void GfxIndexedColorSpace::getRGB(GfxColor *color, GfxRGB *rgb) { - Guchar *p; GfxColor color2; - double low[gfxColorMaxComps], range[gfxColorMaxComps]; - int n, i; - n = base->getNComps(); - base->getDefaultRanges(low, range, indexHigh); - p = &lookup[(int)(color->c[0] + 0.5) * n]; - for (i = 0; i < n; ++i) { - color2.c[i] = low[i] + (p[i] / 255.0) * 
range[i]; - } - base->getRGB(&color2, rgb); + base->getRGB(mapColorToBase(color, &color2), rgb); } void GfxIndexedColorSpace::getCMYK(GfxColor *color, GfxCMYK *cmyk) { - Guchar *p; GfxColor color2; - double low[gfxColorMaxComps], range[gfxColorMaxComps]; - int n, i; - n = base->getNComps(); - base->getDefaultRanges(low, range, indexHigh); - p = &lookup[(int)(color->c[0] + 0.5) * n]; - for (i = 0; i < n; ++i) { - color2.c[i] = low[i] + (p[i] / 255.0) * range[i]; - } - base->getCMYK(&color2, cmyk); + base->getCMYK(mapColorToBase(color, &color2), cmyk); } void GfxIndexedColorSpace::getDefaultRanges(double *decodeLow, @@ -1789,6 +1803,15 @@ void GfxImageColorMap::getCMYK(Guchar *x, GfxCMYK *cmyk) { } } +void GfxImageColorMap::getColor(Guchar *x, GfxColor *color) { + int maxPixel, i; + + maxPixel = (1 << bits) - 1; + for (i = 0; i < nComps; ++i) { + color->c[i] = decodeLow[i] + (x[i] * decodeRange[i]) / maxPixel; + } +} + //------------------------------------------------------------------------ // GfxSubpath and GfxPath //------------------------------------------------------------------------ diff --git a/pdf/xpdf/GfxState.h b/pdf/xpdf/GfxState.h index e99735c..cfe8f9b 100644 --- a/pdf/xpdf/GfxState.h +++ b/pdf/xpdf/GfxState.h @@ -53,6 +53,8 @@ struct GfxCMYK { // GfxColorSpace //------------------------------------------------------------------------ +// NB: The nGfxColorSpaceModes constant and the gfxColorSpaceModeNames +// array defined in GfxState.cc must match this enum. enum GfxColorSpaceMode { csDeviceGray, csCalGray, @@ -91,6 +93,12 @@ public: virtual void getDefaultRanges(double *decodeLow, double *decodeRange, int maxImgPixel); + // Return the number of color space modes + static int getNumColorSpaceModes(); + + // Return the name of the th color space mode. 
+ static char *getColorSpaceModeName(int idx); + private: }; @@ -344,6 +352,7 @@ public: GfxColorSpace *getBase() { return base; } int getIndexHigh() { return indexHigh; } Guchar *getLookup() { return lookup; } + GfxColor *mapColorToBase(GfxColor *color, GfxColor *baseColor); private: @@ -636,6 +645,7 @@ public: void getGray(Guchar *x, double *gray); void getRGB(Guchar *x, GfxRGB *rgb); void getCMYK(Guchar *x, GfxCMYK *cmyk); + void getColor(Guchar *x, GfxColor *color); private: @@ -902,6 +912,7 @@ public: void clip(); // Text position. + void textSetPos(double tx, double ty) { lineX = tx; lineY = ty; } void textMoveTo(double tx, double ty) { lineX = tx; lineY = ty; textTransform(tx, ty, &curX, &curY); } void textShift(double tx, double ty); diff --git a/pdf/xpdf/GlobalParams.cc b/pdf/xpdf/GlobalParams.cc index b50c15b..ded583f 100644 --- a/pdf/xpdf/GlobalParams.cc +++ b/pdf/xpdf/GlobalParams.cc @@ -31,6 +31,14 @@ #include "FontEncodingTables.h" #include "GlobalParams.h" +#if MULTITHREADED +# define globalParamsLock gLockMutex(&mutex) +# define globalParamsUnlock gUnlockMutex(&mutex) +#else +# define globalParamsLock +# define globalParamsUnlock +#endif + #include "NameToUnicodeTable.h" #include "UnicodeMapTables.h" #include "DisplayFontTable.h" @@ -124,6 +132,10 @@ GlobalParams::GlobalParams(char *cfgFileName) { FILE *f; int i; +#if MULTITHREADED + gInitMutex(&mutex); +#endif + initBuiltinFontTables(); // scan the encoding in reverse because we want the lowest-numbered @@ -276,7 +288,7 @@ void GlobalParams::parseFile(GString *fileName, FILE *f) { FILE *f2; line = 1; - while (fgets(buf, sizeof(buf) - 1, f)) { + while (getLine(buf, sizeof(buf) - 1, f)) { // break the line into tokens tokens = new GList(); @@ -293,7 +305,7 @@ void GlobalParams::parseFile(GString *fileName, FILE *f) { for (p2 = p1 + 1; *p2 && !isspace(*p2); ++p2) ; } tokens->append(new GString(p1, p2 - p1)); - p1 = p2 + 1; + p1 = *p2 ? 
p2 + 1 : p2; } if (tokens->getLength() > 0 && @@ -329,12 +341,18 @@ void GlobalParams::parseFile(GString *fileName, FILE *f) { parseDisplayFont(tokens, displayFonts, displayFontT1, fileName, line); } else if (!cmd->cmp("displayFontTT")) { parseDisplayFont(tokens, displayFonts, displayFontTT, fileName, line); + } else if (!cmd->cmp("displayNamedCIDFontX")) { + parseDisplayFont(tokens, displayNamedCIDFonts, + displayFontX, fileName, line); } else if (!cmd->cmp("displayCIDFontX")) { parseDisplayFont(tokens, displayCIDFonts, displayFontX, fileName, line); - } else if (!cmd->cmp("displayNamedCIDFontX")) { + } else if (!cmd->cmp("displayNamedCIDFontT1")) { parseDisplayFont(tokens, displayNamedCIDFonts, - displayFontX, fileName, line); + displayFontT1, fileName, line); + } else if (!cmd->cmp("displayCIDFontT1")) { + parseDisplayFont(tokens, displayCIDFonts, + displayFontT1, fileName, line); } else if (!cmd->cmp("psFile")) { parsePSFile(tokens, fileName, line); } else if (!cmd->cmp("psFont")) { @@ -428,7 +446,7 @@ void GlobalParams::parseNameToUnicode(GList *tokens, GString *fileName, return; } line2 = 1; - while (fgets(buf, sizeof(buf), f)) { + while (getLine(buf, sizeof(buf), f)) { tok1 = strtok(buf, " \t\r\n"); tok2 = strtok(NULL, " \t\r\n"); if (tok1 && tok2) { @@ -794,6 +812,10 @@ GlobalParams::~GlobalParams() { delete cidToUnicodeCache; delete unicodeMapCache; delete cMapCache; + +#if MULTITHREADED + gDestroyMutex(&mutex); +#endif } //------------------------------------------------------------------------ @@ -870,7 +892,12 @@ FILE *GlobalParams::findToUnicodeFile(GString *name) { } DisplayFontParam *GlobalParams::getDisplayFont(GString *fontName) { - return (DisplayFontParam *)displayFonts->lookup(fontName); + DisplayFontParam *dfp; + + globalParamsLock; + dfp = (DisplayFontParam *)displayFonts->lookup(fontName); + globalParamsUnlock; + return dfp; } DisplayFontParam *GlobalParams::getDisplayCIDFont(GString *fontName, @@ -884,6 +911,51 @@ DisplayFontParam 
*GlobalParams::getDisplayCIDFont(GString *fontName, return dfp; } +GString *GlobalParams::getPSFile() { + GString *s; + + globalParamsLock; + s = psFile ? psFile->copy() : (GString *)NULL; + globalParamsUnlock; + return s; +} + +int GlobalParams::getPSPaperWidth() { + int w; + + globalParamsLock; + w = psPaperWidth; + globalParamsUnlock; + return w; +} + +int GlobalParams::getPSPaperHeight() { + int h; + + globalParamsLock; + h = psPaperHeight; + globalParamsUnlock; + return h; +} + +GBool GlobalParams::getPSDuplex() { + GBool d; + + globalParamsLock; + d = psDuplex; + globalParamsUnlock; + return d; +} + +PSLevel GlobalParams::getPSLevel() { + PSLevel level; + + globalParamsLock; + level = psLevel; + globalParamsUnlock; + return level; +} + PSFontParam *GlobalParams::getPSFont(GString *fontName) { return (PSFontParam *)psFonts->lookup(fontName); } @@ -917,6 +989,78 @@ PSFontParam *GlobalParams::getPSFont16(GString *fontName, return p; } +GBool GlobalParams::getPSEmbedType1() { + GBool e; + + globalParamsLock; + e = psEmbedType1; + globalParamsUnlock; + return e; +} + +GBool GlobalParams::getPSEmbedTrueType() { + GBool e; + + globalParamsLock; + e = psEmbedTrueType; + globalParamsUnlock; + return e; +} + +GBool GlobalParams::getPSEmbedCIDPostScript() { + GBool e; + + globalParamsLock; + e = psEmbedCIDPostScript; + globalParamsUnlock; + return e; +} + +GBool GlobalParams::getPSEmbedCIDTrueType() { + GBool e; + + globalParamsLock; + e = psEmbedCIDTrueType; + globalParamsUnlock; + return e; +} + +GBool GlobalParams::getPSOPI() { + GBool opi; + + globalParamsLock; + opi = psOPI; + globalParamsUnlock; + return opi; +} + +GBool GlobalParams::getPSASCIIHex() { + GBool ah; + + globalParamsLock; + ah = psASCIIHex; + globalParamsUnlock; + return ah; +} + +EndOfLineKind GlobalParams::getTextEOL() { + EndOfLineKind eol; + + globalParamsLock; + eol = textEOL; + globalParamsUnlock; + return eol; +} + +GBool GlobalParams::getTextKeepTinyChars() { + GBool tiny; + + 
globalParamsLock; + tiny = textKeepTinyChars; + globalParamsUnlock; + return tiny; +} + GString *GlobalParams::findFontFile(GString *fontName, char *ext1, char *ext2) { GString *dir, *fileName; @@ -947,26 +1091,105 @@ GString *GlobalParams::findFontFile(GString *fontName, return NULL; } +GString *GlobalParams::getInitialZoom() { + GString *s; + + globalParamsLock; + s = initialZoom->copy(); + globalParamsUnlock; + return s; +} + +FontRastControl GlobalParams::getT1libControl() { + FontRastControl c; + + globalParamsLock; + c = t1libControl; + globalParamsUnlock; + return c; +} + +FontRastControl GlobalParams::getFreeTypeControl() { + FontRastControl c; + + globalParamsLock; + c = freetypeControl; + globalParamsUnlock; + return c; +} + +GBool GlobalParams::getMapNumericCharNames() { + GBool map; + + globalParamsLock; + map = mapNumericCharNames; + globalParamsUnlock; + return map; +} + +GBool GlobalParams::getPrintCommands() { + GBool p; + + globalParamsLock; + p = printCommands; + globalParamsUnlock; + return p; +} + +GBool GlobalParams::getErrQuiet() { + GBool q; + + globalParamsLock; + q = errQuiet; + globalParamsUnlock; + return q; +} + CharCodeToUnicode *GlobalParams::getCIDToUnicode(GString *collection) { - return cidToUnicodeCache->getCIDToUnicode(collection); + CharCodeToUnicode *ctu; + + globalParamsLock; + ctu = cidToUnicodeCache->getCIDToUnicode(collection); + globalParamsUnlock; + return ctu; } UnicodeMap *GlobalParams::getUnicodeMap(GString *encodingName) { UnicodeMap *map; + globalParamsLock; + map = getUnicodeMap2(encodingName); + globalParamsUnlock; + return map; +} + +UnicodeMap *GlobalParams::getUnicodeMap2(GString *encodingName) { + UnicodeMap *map; + if ((map = getResidentUnicodeMap(encodingName))) { map->incRefCnt(); - return map; + } else { + map = unicodeMapCache->getUnicodeMap(encodingName); } - return unicodeMapCache->getUnicodeMap(encodingName); + return map; } CMap *GlobalParams::getCMap(GString *collection, GString *cMapName) { - return 
cMapCache->getCMap(collection, cMapName); + CMap *cMap; + + globalParamsLock; + cMap = cMapCache->getCMap(collection, cMapName); + globalParamsUnlock; + return cMap; } UnicodeMap *GlobalParams::getTextEncoding() { - return getUnicodeMap(textEncoding); + UnicodeMap *map; + + globalParamsLock; + map = getUnicodeMap2(textEncoding); + globalParamsUnlock; + return map; } //------------------------------------------------------------------------ @@ -976,20 +1199,25 @@ UnicodeMap *GlobalParams::getTextEncoding() { void GlobalParams::addDisplayFont(DisplayFontParam *param) { DisplayFontParam *old; + globalParamsLock; if ((old = (DisplayFontParam *)displayFonts->remove(param->name))) { delete old; } displayFonts->add(param->name, param); + globalParamsUnlock; } void GlobalParams::setPSFile(char *file) { + globalParamsLock; if (psFile) { delete psFile; } psFile = new GString(file); + globalParamsUnlock; } GBool GlobalParams::setPSPaperSize(char *size) { + globalParamsLock; if (!strcmp(size, "letter")) { psPaperWidth = 612; psPaperHeight = 792; @@ -1003,57 +1231,82 @@ GBool GlobalParams::setPSPaperSize(char *size) { psPaperWidth = 842; psPaperHeight = 1190; } else { + globalParamsUnlock; return gFalse; } + globalParamsUnlock; return gTrue; } void GlobalParams::setPSPaperWidth(int width) { + globalParamsLock; psPaperWidth = width; + globalParamsUnlock; } void GlobalParams::setPSPaperHeight(int height) { + globalParamsLock; psPaperHeight = height; + globalParamsUnlock; } void GlobalParams::setPSDuplex(GBool duplex) { + globalParamsLock; psDuplex = duplex; + globalParamsUnlock; } void GlobalParams::setPSLevel(PSLevel level) { + globalParamsLock; psLevel = level; + globalParamsUnlock; } void GlobalParams::setPSEmbedType1(GBool embed) { + globalParamsLock; psEmbedType1 = embed; + globalParamsUnlock; } void GlobalParams::setPSEmbedTrueType(GBool embed) { + globalParamsLock; psEmbedTrueType = embed; + globalParamsUnlock; } void GlobalParams::setPSEmbedCIDPostScript(GBool embed) { + 
globalParamsLock; psEmbedCIDPostScript = embed; + globalParamsUnlock; } void GlobalParams::setPSEmbedCIDTrueType(GBool embed) { + globalParamsLock; psEmbedCIDTrueType = embed; + globalParamsUnlock; } void GlobalParams::setPSOPI(GBool opi) { + globalParamsLock; psOPI = opi; + globalParamsUnlock; } void GlobalParams::setPSASCIIHex(GBool hex) { + globalParamsLock; psASCIIHex = hex; + globalParamsUnlock; } void GlobalParams::setTextEncoding(char *encodingName) { + globalParamsLock; delete textEncoding; textEncoding = new GString(encodingName); + globalParamsUnlock; } GBool GlobalParams::setTextEOL(char *s) { + globalParamsLock; if (!strcmp(s, "unix")) { textEOL = eolUnix; } else if (!strcmp(s, "dos")) { @@ -1061,26 +1314,42 @@ GBool GlobalParams::setTextEOL(char *s) { } else if (!strcmp(s, "mac")) { textEOL = eolMac; } else { + globalParamsUnlock; return gFalse; } + globalParamsUnlock; return gTrue; } void GlobalParams::setTextKeepTinyChars(GBool keep) { + globalParamsLock; textKeepTinyChars = keep; + globalParamsUnlock; } void GlobalParams::setInitialZoom(char *s) { + globalParamsLock; delete initialZoom; initialZoom = new GString(s); + globalParamsUnlock; } GBool GlobalParams::setT1libControl(char *s) { - return setFontRastControl(&t1libControl, s); + GBool ok; + + globalParamsLock; + ok = setFontRastControl(&t1libControl, s); + globalParamsUnlock; + return ok; } GBool GlobalParams::setFreeTypeControl(char *s) { - return setFontRastControl(&freetypeControl, s); + GBool ok; + + globalParamsLock; + ok = setFontRastControl(&freetypeControl, s); + globalParamsUnlock; + return ok; } GBool GlobalParams::setFontRastControl(FontRastControl *val, char *s) { @@ -1099,13 +1368,19 @@ GBool GlobalParams::setFontRastControl(FontRastControl *val, char *s) { } void GlobalParams::setMapNumericCharNames(GBool map) { + globalParamsLock; mapNumericCharNames = map; + globalParamsUnlock; } void GlobalParams::setPrintCommands(GBool printCommandsA) { + globalParamsLock; printCommands = 
printCommandsA; + globalParamsUnlock; } void GlobalParams::setErrQuiet(GBool errQuietA) { + globalParamsLock; errQuiet = errQuietA; + globalParamsUnlock; } diff --git a/pdf/xpdf/GlobalParams.h b/pdf/xpdf/GlobalParams.h index 0f783e8..5fb3be3 100644 --- a/pdf/xpdf/GlobalParams.h +++ b/pdf/xpdf/GlobalParams.h @@ -19,6 +19,10 @@ #include "gtypes.h" #include "CharTypes.h" +#if MULTITHREADED +#include "GMutex.h" +#endif + class GString; class GList; class GHash; @@ -137,31 +141,30 @@ public: FILE *findToUnicodeFile(GString *name); DisplayFontParam *getDisplayFont(GString *fontName); DisplayFontParam *getDisplayCIDFont(GString *fontName, GString *collection); - GString *getPSFile() { return psFile; } - int getPSPaperWidth() { return psPaperWidth; } - int getPSPaperHeight() { return psPaperHeight; } - GBool getPSDuplex() { return psDuplex; } - PSLevel getPSLevel() { return psLevel; } + GString *getPSFile(); + int getPSPaperWidth(); + int getPSPaperHeight(); + GBool getPSDuplex(); + PSLevel getPSLevel(); PSFontParam *getPSFont(GString *fontName); PSFontParam *getPSFont16(GString *fontName, GString *collection, int wMode); - GBool getPSEmbedType1() { return psEmbedType1; } - GBool getPSEmbedTrueType() { return psEmbedTrueType; } - GBool getPSEmbedCIDPostScript() { return psEmbedCIDPostScript; } - GBool getPSEmbedCIDTrueType() { return psEmbedCIDTrueType; } - GBool getPSOPI() { return psOPI; } - GBool getPSASCIIHex() { return psASCIIHex; } - GString *getTextEncodingName() { return textEncoding; } - EndOfLineKind getTextEOL() { return textEOL; } - GBool getTextKeepTinyChars() { return textKeepTinyChars; } + GBool getPSEmbedType1(); + GBool getPSEmbedTrueType(); + GBool getPSEmbedCIDPostScript(); + GBool getPSEmbedCIDTrueType(); + GBool getPSOPI(); + GBool getPSASCIIHex(); + EndOfLineKind getTextEOL(); + GBool getTextKeepTinyChars(); GString *findFontFile(GString *fontName, char *ext1, char *ext2); - GString *getInitialZoom() { return initialZoom; } - FontRastControl 
getT1libControl() { return t1libControl; } - FontRastControl getFreeTypeControl() { return freetypeControl; } + GString *getInitialZoom(); + FontRastControl getT1libControl(); + FontRastControl getFreeTypeControl(); GString *getURLCommand() { return urlCommand; } GString *getMovieCommand() { return movieCommand; } - GBool getMapNumericCharNames() { return mapNumericCharNames; } - GBool getPrintCommands() { return printCommands; } - GBool getErrQuiet() { return errQuiet; } + GBool getMapNumericCharNames(); + GBool getPrintCommands(); + GBool getErrQuiet(); CharCodeToUnicode *getCIDToUnicode(GString *collection); UnicodeMap *getUnicodeMap(GString *encodingName); @@ -220,6 +223,7 @@ private: GList *tokens, GString *fileName, int line); void parseYesNo(char *cmdName, GBool *flag, GList *tokens, GString *fileName, int line); + UnicodeMap *getUnicodeMap2(GString *encodingName); GBool setFontRastControl(FontRastControl *val, char *s); //----- static tables @@ -281,6 +285,10 @@ private: CIDToUnicodeCache *cidToUnicodeCache; UnicodeMapCache *unicodeMapCache; CMapCache *cMapCache; + +#ifdef MULTITHREADED + GMutex mutex; +#endif }; #endif diff --git a/pdf/xpdf/Link.cc b/pdf/xpdf/Link.cc index b16563a..0c3a869 100644 --- a/pdf/xpdf/Link.cc +++ b/pdf/xpdf/Link.cc @@ -170,44 +170,52 @@ LinkDest::LinkDest(Array *a) { // XYZ link if (obj1.isName("XYZ")) { - if (a->getLength() != 5) { - error(-1, "Annotation destination array has wrong length"); - goto err2; - } kind = destXYZ; - a->get(2, &obj2); - if (obj2.isNull()) { + if (a->getLength() < 3) { changeLeft = gFalse; - } else if (obj2.isNum()) { - changeLeft = gTrue; - left = obj2.getNum(); } else { - error(-1, "Bad annotation destination position"); - goto err1; + a->get(2, &obj2); + if (obj2.isNull()) { + changeLeft = gFalse; + } else if (obj2.isNum()) { + changeLeft = gTrue; + left = obj2.getNum(); + } else { + error(-1, "Bad annotation destination position"); + goto err1; + } + obj2.free(); } - obj2.free(); - a->get(3, &obj2); 
- if (obj2.isNull()) { + if (a->getLength() < 4) { changeTop = gFalse; - } else if (obj2.isNum()) { - changeTop = gTrue; - top = obj2.getNum(); } else { - error(-1, "Bad annotation destination position"); - goto err1; + a->get(3, &obj2); + if (obj2.isNull()) { + changeTop = gFalse; + } else if (obj2.isNum()) { + changeTop = gTrue; + top = obj2.getNum(); + } else { + error(-1, "Bad annotation destination position"); + goto err1; + } + obj2.free(); } - obj2.free(); - a->get(4, &obj2); - if (obj2.isNull()) { + if (a->getLength() < 5) { changeZoom = gFalse; - } else if (obj2.isNum()) { - changeZoom = gTrue; - zoom = obj2.getNum(); } else { - error(-1, "Bad annotation destination position"); - goto err1; + a->get(4, &obj2); + if (obj2.isNull()) { + changeZoom = gFalse; + } else if (obj2.isNum()) { + changeZoom = gTrue; + zoom = obj2.getNum(); + } else { + error(-1, "Bad annotation destination position"); + goto err1; + } + obj2.free(); } - obj2.free(); // Fit link } else if (obj1.isName("Fit")) { diff --git a/pdf/xpdf/NameToUnicodeTable.h b/pdf/xpdf/NameToUnicodeTable.h index 320c558..99bcf1d 100644 --- a/pdf/xpdf/NameToUnicodeTable.h +++ b/pdf/xpdf/NameToUnicodeTable.h @@ -684,8 +684,8 @@ static struct { {0xf6e2, "commasuperior"}, {0x2245, "congruent"}, {0x00a9, "copyright"}, - {0xf8e9, "copyrightsans"}, - {0xf6d9, "copyrightserif"}, + {0x00a9, "copyrightsans"}, + {0x00a9, "copyrightserif"}, {0x00a4, "currency"}, {0xf6d1, "cyrBreve"}, {0xf6d2, "cyrFlex"}, @@ -972,8 +972,8 @@ static struct { {0x2286, "reflexsubset"}, {0x2287, "reflexsuperset"}, {0x00ae, "registered"}, - {0xf8e8, "registersans"}, - {0xf6da, "registerserif"}, + {0x00ae, "registersans"}, + {0x00ae, "registerserif"}, {0x2310, "revlogicalnot"}, {0x03c1, "rho"}, {0x02da, "ring"}, @@ -1031,8 +1031,8 @@ static struct { {0x0303, "tildecomb"}, {0x0384, "tonos"}, {0x2122, "trademark"}, - {0xf8ea, "trademarksans"}, - {0xf6db, "trademarkserif"}, + {0x2122, "trademarksans"}, + {0x2122, "trademarkserif"}, {0x25bc, 
"triagdn"}, {0x25c4, "triaglf"}, {0x25ba, "triagrt"}, diff --git a/pdf/xpdf/Stream.cc b/pdf/xpdf/Stream.cc index 0d19d4d..b2abef8 100644 --- a/pdf/xpdf/Stream.cc +++ b/pdf/xpdf/Stream.cc @@ -467,7 +467,7 @@ GBool StreamPredictor::getNextLine() { upLeftBuf[1] = upLeftBuf[0]; upLeftBuf[0] = predLine[i]; if ((c = str->getRawChar()) == EOF) { - break; + return gFalse; } switch (curPred) { case 11: // PNG sub @@ -506,7 +506,6 @@ GBool StreamPredictor::getNextLine() { } // apply TIFF (component) predictor - //~ this is completely untested if (predictor == 2) { if (nBits == 1) { inBuf = predLine[pixBytes - 1]; diff --git a/pdf/xpdf/TextOutputDev.cc b/pdf/xpdf/TextOutputDev.cc index 891752c..b782b42 100644 --- a/pdf/xpdf/TextOutputDev.cc +++ b/pdf/xpdf/TextOutputDev.cc @@ -17,8 +17,9 @@ #include #include #include -#include "GString.h" #include "gmem.h" +#include "GString.h" +#include "GList.h" #include "config.h" #include "Error.h" #include "GlobalParams.h" @@ -32,103 +33,153 @@ #endif //------------------------------------------------------------------------ - -#define textOutSpace 0.2 -#define textOutColSpace 0.2 - +// parameters //------------------------------------------------------------------------ -struct TextOutColumnEdge { - double x, y0, y1; -}; +// Minium and maximum inter-word spacing (as a fraction of the average +// character width). +#define wordMinSpaceWidth 0.3 +#define wordMaxSpaceWidth 2.0 + +// Default min and max inter-word spacing (when the average character +// width is unknown). +#define wordDefMinSpaceWidth 0.2 +#define wordDefMaxSpaceWidth 1.5 + +// Max difference in x,y coordinates (as a fraction of the font size) +// allowed for duplicated text (fake boldface, drop shadows) which is +// to be discarded. +#define dupMaxDeltaX 0.2 +#define dupMaxDeltaY 0.2 + +// Min overlap (as a fraction of the font size) required for two +// lines to be considered vertically overlapping. 
+#define lineOverlapSlack 0.5 + +// Max difference in baseline y coordinates (as a fraction of the font +// size) allowed for words which are to be grouped into a line, not +// including sub/superscripts. +#define lineMaxBaselineDelta 0.1 + +// Max ratio of font sizes allowed for words which are to be grouped +// into a line, not including sub/superscripts. +#define lineMaxFontSizeRatio 1.4 + +// Min spacing (as a fraction of the font size) allowed between words +// which are to be grouped into a line. +#define lineMinDeltaX -0.5 + +// Minimum vertical overlap (as a fraction of the font size) required +// for superscript and subscript words. +#define lineMinSuperscriptOverlap 0.3 +#define lineMinSubscriptOverlap 0.3 + +// Min/max ratio of font sizes allowed for sub/superscripts compared to +// the base text. +#define lineMinSubscriptFontSizeRatio 0.4 +#define lineMaxSubscriptFontSizeRatio 1.01 +#define lineMinSuperscriptFontSizeRatio 0.4 +#define lineMaxSuperscriptFontSizeRatio 1.01 + +// Max horizontal spacing (as a fraction of the font size) allowed +// before sub/superscripts. +#define lineMaxSubscriptDeltaX 0.2 +#define lineMaxSuperscriptDeltaX 0.2 + +// Maximum vertical spacing (as a fraction of the font size) allowed +// for lines which are to be grouped into a block. +#define blkMaxSpacing 2.0 + +// Max ratio of primary font sizes allowed for lines which are to be +// grouped into a block. +#define blkMaxFontSizeRatio 1.3 + +// Min overlap (as a fraction of the font size) required for two +// blocks to be considered vertically overlapping. +#define blkOverlapSlack 0.5 + +// Max vertical spacing (as a fraction of the font size) allowed +// between blocks which are 'adjacent' when sorted by reading order. +#define blkMaxSortSpacing 2.0 + +// Max vertical offset (as a fraction of the font size) of the top and +// bottom edges allowed for blocks which are to be grouped into a +// flow. 
+#define flowMaxDeltaY 1.0 //------------------------------------------------------------------------ -// TextBlock +// TextFontInfo //------------------------------------------------------------------------ -class TextBlock { -public: - - TextBlock(); - ~TextBlock(); - - double xMin, xMax; - double yMin, yMax; - TextString *strings; // list of strings in the block - TextBlock *next; // next block in line - TextBlock *xyNext; // next block on xyBlocks list - Unicode *text; // Unicode text of the block, including - // spaces between strings - double *xRight; // right-hand x coord of each char - int len; // total number of Unicode characters - int convertedLen; // total number of converted characters - int *col; // starting column number for each - // Unicode character -}; - -TextBlock::TextBlock() { - strings = NULL; - next = NULL; - xyNext = NULL; - text = NULL; - xRight = NULL; - col = NULL; -} - -TextBlock::~TextBlock() { - TextString *p1, *p2; +TextFontInfo::TextFontInfo(GfxState *state) { + double *textMat; + double t1, t2, avgWidth, w; + int n, i; - for (p1 = strings; p1; p1 = p2) { - p2 = p1->next; - delete p1; + gfxFont = state->getFont(); + textMat = state->getTextMat(); + horizScaling = state->getHorizScaling(); + if ((t1 = fabs(textMat[0])) > 0.01 && + (t2 = fabs(textMat[3])) > 0.01) { + horizScaling *= t1 / t2; } - gfree(text); - gfree(xRight); - gfree(col); -} - -//------------------------------------------------------------------------ -// TextLine -//------------------------------------------------------------------------ - -class TextLine { -public: - TextLine(); - ~TextLine(); + if (!gfxFont) { + minSpaceWidth = horizScaling * wordDefMinSpaceWidth; + maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth; + } else if (gfxFont->isCIDFont()) { + //~ handle 16-bit fonts + minSpaceWidth = horizScaling * wordDefMinSpaceWidth; + maxSpaceWidth = horizScaling * wordDefMaxSpaceWidth; + } else { + avgWidth = 0; + n = 0; + for (i = 0; i < 256; ++i) { + w = 
((Gfx8BitFont *)gfxFont)->getWidth(i); + if (w > 0) { + avgWidth += w; + ++n; + } + } + avgWidth /= n; + minSpaceWidth = horizScaling * wordMinSpaceWidth * avgWidth; + maxSpaceWidth = horizScaling * wordMaxSpaceWidth * avgWidth; + } - TextBlock *blocks; - TextLine *next; - double yMin, yMax; -}; +} -TextLine::TextLine() { - blocks = NULL; - next = NULL; +TextFontInfo::~TextFontInfo() { } -TextLine::~TextLine() { - TextBlock *p1, *p2; +GBool TextFontInfo::matches(GfxState *state) { + double *textMat; + double t1, t2, h; - for (p1 = blocks; p1; p1 = p2) { - p2 = p1->next; - delete p1; + textMat = state->getTextMat(); + h = state->getHorizScaling(); + if ((t1 = fabs(textMat[0])) > 0.01 && + (t2 = fabs(textMat[3])) > 0.01) { + h *= t1 / t2; } + return state->getFont() == gfxFont && + fabs(h - horizScaling) < 0.01; } //------------------------------------------------------------------------ -// TextString +// TextWord //------------------------------------------------------------------------ -TextString::TextString(GfxState *state, double x0, double y0, - double fontSize) { - GfxFont *font; +TextWord::TextWord(GfxState *state, double x0, double y0, + TextFontInfo *fontA, double fontSizeA) { + GfxFont *gfxFont; double x, y; + font = fontA; + fontSize = fontSizeA; state->transform(x0, y0, &x, &y); - if ((font = state->getFont())) { - yMin = y - font->getAscent() * fontSize; - yMax = y - font->getDescent() * fontSize; + if ((gfxFont = font->gfxFont)) { + yMin = y - gfxFont->getAscent() * fontSize; + yMax = y - gfxFont->getDescent() * fontSize; } else { // this means that the PDF file draws text without a current font, // which should never happen @@ -141,21 +192,23 @@ TextString::TextString(GfxState *state, double x0, double y0, yMin = y; yMax = y + 1; } - marked = gFalse; + yBase = y; text = NULL; xRight = NULL; len = size = 0; + spaceAfter = gFalse; next = NULL; + } -TextString::~TextString() { +TextWord::~TextWord() { gfree(text); gfree(xRight); } -void 
TextString::addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode u) { +void TextWord::addChar(GfxState *state, double x, double y, + double dx, double dy, Unicode u) { if (len == size) { size += 16; text = (Unicode *)grealloc(text, size * sizeof(Unicode)); @@ -169,35 +222,249 @@ void TextString::addChar(GfxState *state, double x, double y, ++len; } +// Returns true if this word comes before word2 in xy order. +GBool TextWord::xyBefore(TextWord *word2) { + return xMin < word2->xMin || + (xMin == word2->xMin && yMin < word2->yMin); +} + +// Merge another word onto the end of this one. +void TextWord::merge(TextWord *word2) { + int i; + + xMax = word2->xMax; + if (word2->yMin < yMin) { + yMin = word2->yMin; + } + if (word2->yMax > yMax) { + yMax = word2->yMax; + } + if (len + word2->len > size) { + size = len + word2->len; + text = (Unicode *)grealloc(text, size * sizeof(Unicode)); + xRight = (double *)grealloc(xRight, size * sizeof(double)); + } + for (i = 0; i < word2->len; ++i) { + text[len + i] = word2->text[i]; + xRight[len + i] = word2->xRight[i]; + } + len += word2->len; +} + +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ + +TextLine::TextLine() { + words = NULL; + text = NULL; + xRight = NULL; + col = NULL; + len = 0; + hyphenated = gFalse; + pageNext = NULL; + next = NULL; + flowNext = NULL; +} + +TextLine::~TextLine() { + TextWord *w1, *w2; + + for (w1 = words; w1; w1 = w2) { + w2 = w1->next; + delete w1; + } + gfree(text); + gfree(xRight); + gfree(col); +} + +// Returns true if this line comes before line2 in yx order, allowing +// slack for vertically overlapping lines. 
+GBool TextLine::yxBefore(TextLine *line2) { + double dy; + + dy = lineOverlapSlack * fontSize; + + // non-overlapping case + if (line2->yMin > yMax - dy || + line2->yMax < yMin + dy) { + return yMin < line2->yMin || + (yMin == line2->yMin && xMin < line2->xMin); + } + + // overlapping case + return xMin < line2->xMin; +} + +// Merge another line's words onto the end of this line. +void TextLine::merge(TextLine *line2) { + TextWord *word; + int newLen, i; + + xMax = line2->xMax; + if (line2->yMin < yMin) { + yMin = line2->yMin; + } + if (line2->yMax > yMax) { + yMax = line2->yMax; + } + xSpaceR = line2->xSpaceR; + for (word = words; word->next; word = word->next) ; + word->spaceAfter = gTrue; + word->next = line2->words; + line2->words = NULL; + newLen = len + 1 + line2->len; + text = (Unicode *)grealloc(text, newLen * sizeof(Unicode)); + xRight = (double *)grealloc(xRight, newLen * sizeof(double)); + text[len] = (Unicode)0x0020; + xRight[len] = line2->xMin; + for (i = 0; i < line2->len; ++i) { + text[len + 1 + i] = line2->text[i]; + xRight[len + 1 + i] = line2->xRight[i]; + } + len = newLen; + convertedLen += line2->convertedLen; + hyphenated = line2->hyphenated; +} + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +TextBlock::TextBlock() { + lines = NULL; + next = NULL; +} + +TextBlock::~TextBlock() { + TextLine *l1, *l2; + + for (l1 = lines; l1; l1 = l2) { + l2 = l1->next; + delete l1; + } +} + +// Returns true if this block comes before blk2 in yx order, allowing +// slack for vertically overlapping blocks. 
+GBool TextBlock::yxBefore(TextBlock *blk2) { + double dy; + + dy = blkOverlapSlack * lines->fontSize; + + // non-overlapping case + if (blk2->yMin > yMax - dy || + blk2->yMax < yMin + dy) { + return yMin < blk2->yMin || + (yMin == blk2->yMin && xMin < blk2->xMin); + } + + // overlapping case + return xMin < blk2->xMin; +} + +// Merge another block's line onto the right of this one. +void TextBlock::mergeRight(TextBlock *blk2) { + lines->merge(blk2->lines); + xMax = lines->xMax; + yMin = lines->yMin; + yMax = lines->yMax; + xSpaceR = lines->xSpaceR; +} + +// Merge another block's lines onto the bottom of this block. +void TextBlock::mergeBelow(TextBlock *blk2) { + TextLine *line; + + if (blk2->xMin < xMin) { + xMin = blk2->xMin; + } + if (blk2->xMax > xMax) { + xMax = blk2->xMax; + } + yMax = blk2->yMax; + if (blk2->xSpaceL > xSpaceL) { + xSpaceL = blk2->xSpaceL; + } + if (blk2->xSpaceR < xSpaceR) { + xSpaceR = blk2->xSpaceR; + } + if (blk2->maxFontSize > maxFontSize) { + maxFontSize = blk2->maxFontSize; + } + for (line = lines; line->next; line = line->next) ; + line->next = line->flowNext = blk2->lines; + blk2->lines = NULL; +} + +//------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +TextFlow::TextFlow() { + blocks = NULL; + next = NULL; +} + +TextFlow::~TextFlow() { + TextBlock *b1, *b2; + + for (b1 = blocks; b1; b1 = b2) { + b2 = b1->next; + delete b1; + } +} + + //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ TextPage::TextPage(GBool rawOrderA) { rawOrder = rawOrderA; - curStr = NULL; + curWord = NULL; + font = NULL; fontSize = 0; - xyStrings = NULL; - xyCur1 = xyCur2 = NULL; - lines = NULL; nest = 0; nTinyChars = 0; + words = wordPtr = NULL; + lines = NULL; + flows = NULL; + fonts = new GList(); } TextPage::~TextPage() { clear(); + 
delete fonts; } void TextPage::updateFont(GfxState *state) { - GfxFont *font; + GfxFont *gfxFont; double *fm; char *name; int code, mCode, letterCode, anyCode; double w; + int i; + + // get the font info object + font = NULL; + for (i = 0; i < fonts->getLength(); ++i) { + font = (TextFontInfo *)fonts->get(i); + if (font->matches(state)) { + break; + } + font = NULL; + } + if (!font) { + font = new TextFontInfo(state); + fonts->append(font); + } // adjust the font size + gfxFont = state->getFont(); fontSize = state->getTransformedFontSize(); - if ((font = state->getFont()) && font->getType() == fontType3) { + if (gfxFont && gfxFont->getType() == fontType3) { // This is a hack which makes it possible to deal with some Type 3 // fonts. The problem is that it's impossible to know what the // base coordinate system used in the font is without actually @@ -206,7 +473,7 @@ void TextPage::updateFont(GfxState *state) { // subset that doesn't contain 'm'). mCode = letterCode = anyCode = -1; for (code = 0; code < 256; ++code) { - name = ((Gfx8BitFont *)font)->getCharName(code); + name = ((Gfx8BitFont *)gfxFont)->getCharName(code); if (name && name[0] == 'm' && name[1] == '\0') { mCode = code; } @@ -215,647 +482,1202 @@ void TextPage::updateFont(GfxState *state) { (name[0] >= 'a' && name[0] <= 'z'))) { letterCode = code; } - if (anyCode < 0 && name && ((Gfx8BitFont *)font)->getWidth(code) > 0) { + if (anyCode < 0 && name && + ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) { anyCode = code; } } if (mCode >= 0 && - (w = ((Gfx8BitFont *)font)->getWidth(mCode)) > 0) { + (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) { // 0.6 is a generic average 'm' width -- yes, this is a hack fontSize *= w / 0.6; } else if (letterCode >= 0 && - (w = ((Gfx8BitFont *)font)->getWidth(letterCode)) > 0) { + (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) { // even more of a hack: 0.5 is a generic letter width fontSize *= w / 0.5; } else if (anyCode >= 0 && - (w = ((Gfx8BitFont 
*)font)->getWidth(anyCode)) > 0) { + (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) { // better than nothing: 0.5 is a generic character width fontSize *= w / 0.5; } - fm = font->getFontMatrix(); + fm = gfxFont->getFontMatrix(); if (fm[0] != 0) { fontSize *= fabs(fm[3] / fm[0]); } } } -void TextPage::beginString(GfxState *state, double x0, double y0) { +void TextPage::beginWord(GfxState *state, double x0, double y0) { // This check is needed because Type 3 characters can contain - // text-drawing operations. - if (curStr) { + // text-drawing operations (when TextPage is being used via + // XOutputDev rather than TextOutputDev). + if (curWord) { ++nest; return; } - curStr = new TextString(state, x0, y0, fontSize); + curWord = new TextWord(state, x0, y0, font, fontSize); } void TextPage::addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode *u, int uLen) { - double x1, y1, w1, h1, dx2, dy2; + double dx, double dy, + CharCode c, Unicode *u, int uLen) { + double x1, y1, w1, h1, dx2, dy2, sp; int n, i; + // if the previous char was a space, addChar will have called + // endWord, so we need to start a new word + if (!curWord) { + beginWord(state, x, y); + } + + // throw away chars that aren't inside the page bounds state->transform(x, y, &x1, &y1); - if (x1 < 0 || x1 > state->getPageWidth() || - y1 < 0 || y1 > state->getPageHeight()) { + if (x1 < 0 || x1 > pageWidth || + y1 < 0 || y1 > pageHeight) { return; } - state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(), - 0, &dx2, &dy2); + + // subtract char and word spacing from the dx,dy values + sp = state->getCharSpace(); + if (c == (CharCode)0x20) { + sp += state->getWordSpace(); + } + state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2); dx -= dx2; dy -= dy2; state->transformDelta(dx, dy, &w1, &h1); + + // check the tiny chars limit if (!globalParams->getTextKeepTinyChars() && fabs(w1) < 3 && fabs(h1) < 3) { if (++nTinyChars > 20000) { return; } } - 
n = curStr->len; - if (n > 0 && x1 - curStr->xRight[n-1] > - 0.1 * (curStr->yMax - curStr->yMin)) { - // large char spacing is sometimes used to move text around - endString(); - beginString(state, x, y); - } - if (uLen == 1 && u[0] == (Unicode)0x20 && - w1 > 0.5 * (curStr->yMax - curStr->yMin)) { - // large word spacing is sometimes used to move text around + + // break words at space character + if (uLen == 1 && u[0] == (Unicode)0x20) { + endWord(); return; } + + // large char spacing is sometimes used to move text around -- in + // this case, break text into individual chars and let the coalesce + // function deal with it later + n = curWord->len; + if (n > 0 && x1 - curWord->xRight[n-1] > + curWord->font->minSpaceWidth * curWord->fontSize) { + // large char spacing is sometimes used to move text around + endWord(); + beginWord(state, x, y); + } + + // add the characters to the current word if (uLen != 0) { w1 /= uLen; h1 /= uLen; } for (i = 0; i < uLen; ++i) { - curStr->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); + curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]); } } -void TextPage::endString() { +void TextPage::endWord() { // This check is needed because Type 3 characters can contain - // text-drawing operations. + // text-drawing operations (when TextPage is being used via + // XOutputDev rather than TextOutputDev). 
if (nest > 0) { --nest; return; } - addString(curStr); - curStr = NULL; + if (curWord) { + addWord(curWord); + curWord = NULL; + } } -void TextPage::addString(TextString *str) { - TextString *p1, *p2; +void TextPage::addWord(TextWord *word) { + TextWord *p1, *p2; - // throw away zero-length strings -- they don't have valid xMin/xMax + // throw away zero-length words -- they don't have valid xMin/xMax // values, and they're useless anyway - if (str->len == 0) { - delete str; + if (word->len == 0) { + delete word; return; } - // insert string in xy list + // insert word in xy list if (rawOrder) { - p1 = xyCur1; + p1 = wordPtr; p2 = NULL; - } else if ((!xyCur1 || xyBefore(xyCur1, str)) && - (!xyCur2 || xyBefore(str, xyCur2))) { - p1 = xyCur1; - p2 = xyCur2; - } else if (xyCur1 && xyBefore(xyCur1, str)) { - for (p1 = xyCur1, p2 = xyCur2; p2; p1 = p2, p2 = p2->next) { - if (xyBefore(str, p2)) { - break; - } - } - xyCur2 = p2; } else { - for (p1 = NULL, p2 = xyStrings; p2; p1 = p2, p2 = p2->next) { - if (xyBefore(str, p2)) { + if (wordPtr && wordPtr->xyBefore(word)) { + p1 = wordPtr; + p2 = wordPtr->next; + } else { + p1 = NULL; + p2 = words; + } + for (; p2; p1 = p2, p2 = p2->next) { + if (word->xyBefore(p2)) { break; } } - xyCur2 = p2; } - xyCur1 = str; if (p1) { - p1->next = str; + p1->next = word; } else { - xyStrings = str; + words = word; } - str->next = p2; + word->next = p2; + wordPtr = word; } void TextPage::coalesce() { - TextLine *line, *line0; - TextBlock *yxBlocks, *xyBlocks, *blk, *blk0, *blk1, *blk2; - TextString *str0, *str1, *str2, *str3, *str4; - TextString *str1prev, *str2prev, *str3prev; - TextOutColumnEdge *edges; + TextWord *word0, *word1, *word2, *word3, *word4; + TextLine *line0, *line1, *line2, *line3, *line4, *lineList; + TextBlock *blk0, *blk1, *blk2, *blk3, *blk4, *blk5, *blk6; + TextBlock *yxBlocks, *blocks, *blkStack; + TextFlow *flow0, *flow1; + double sz, xLimit, minSpace, maxSpace, yLimit; + double fit1, fit2; + GBool found; UnicodeMap 
*uMap; GBool isUnicode; char buf[8]; - int edgesLength, edgesSize; - double x, yMin, yMax; - double space, fit1, fit2, h; - int col1, col2, d; - int i, j; - -#if 0 //~ for debugging - for (str1 = xyStrings; str1; str1 = str1->next) { - printf("x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", - str1->xMin, str1->xMax, str1->yMin, str1->yMax, - (str1->yMax - str1->yMin)); - for (i = 0; i < str1->len; ++i) { - fputc(str1->text[i] & 0xff, stdout); + int col1, col2, d, i, j; + +#if 0 // for debugging + printf("*** initial word list ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } - printf("\n------------------------------------------------------------\n\n"); + printf("\n"); + fflush(stdout); #endif - // build the list of column edges - edges = NULL; - edgesLength = edgesSize = 0; - if (!rawOrder) { - for (str1prev = NULL, str1 = xyStrings; - str1; - str1prev = str1, str1 = str1->next) { - if (str1->marked) { - continue; - } - h = str1->yMax - str1->yMin; - if (str1prev && (str1->xMin - str1prev->xMax) / h < textOutColSpace) { - continue; - } - x = str1->xMin; - yMin = str1->yMin; - yMax = str1->yMax; - for (str2prev = str1, str2 = str1->next; - str2; - str2prev = str2, str2 = str2->next) { - h = str2->yMax - str2->yMin; - if (!str2->marked && - (str2->xMin - str2prev->xMax) / h > textOutColSpace && - fabs(str2->xMin - x) < 0.5 && - str2->yMin - yMax < 0.3 * h && - yMin - str2->yMax < 0.3 * h) { - break; - } - } - if (str2) { - if (str2->yMin < yMin) { - yMin = str2->yMin; - } - if (str2->yMax > yMax) { - yMax = str2->yMax; - } - str2->marked = gTrue; - for (str3prev = str1, str3 = str1->next; - str3; - str3prev = str3, str3 = str3->next) { - h = str3->yMax - str3->yMin; - if (!str3->marked && - (str3->xMin - str3prev->xMax) / h > textOutColSpace && 
- fabs(str3->xMin - x) < 0.5 && - str3->yMin - yMax < 0.3 * h && - yMin - str3->yMax < 0.3 * h) { - break; - } - } - if (str3) { - if (str3->yMin < yMin) { - yMin = str3->yMin; - } - if (str3->yMax > yMax) { - yMax = str3->yMax; - } - str3->marked = gTrue; - do { - for (str2prev = str1, str2 = str1->next; - str2; - str2prev = str2, str2 = str2->next) { - h = str2->yMax - str2->yMin; - if (!str2->marked && - (str2->xMin - str2prev->xMax) / h > textOutColSpace && - fabs(str2->xMin - x) < 0.5 && - str2->yMin - yMax < 0.3 * h && - yMin - str2->yMax < 0.3 * h) { - if (str2->yMin < yMin) { - yMin = str2->yMin; - } - if (str2->yMax > yMax) { - yMax = str2->yMax; - } - str2->marked = gTrue; - break; - } - } - } while (str2); - if (edgesLength == edgesSize) { - edgesSize = edgesSize ? 2 * edgesSize : 16; - edges = (TextOutColumnEdge *) - grealloc(edges, edgesSize * sizeof(TextOutColumnEdge)); - } - edges[edgesLength].x = x; - edges[edgesLength].y0 = yMin; - edges[edgesLength].y1 = yMax; - ++edgesLength; - } else { - str2->marked = gFalse; - } + //----- discard duplicated text (fake boldface, drop shadows) + + word0 = words; + while (word0) { + sz = word0->fontSize; + xLimit = word0->xMin + sz * dupMaxDeltaX; + found = gFalse; + for (word1 = word0, word2 = word0->next; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + if (word2->len == word0->len && + !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode)) && + fabs(word2->yMin - word0->yMin) < sz * dupMaxDeltaY && + fabs(word2->yMax - word0->yMax) < sz * dupMaxDeltaY && + fabs(word2->xMax - word0->xMax) < sz * dupMaxDeltaX) { + found = gTrue; + break; } - str1->marked = gTrue; + } + if (found) { + word1->next = word2->next; + delete word2; + } else { + word0 = word0->next; } } -#if 0 //~ for debugging - printf("column edges:\n"); - for (i = 0; i < edgesLength; ++i) { - printf("%d: x=%.2f y0=%.2f y1=%.2f\n", - i, edges[i].x, edges[i].y0, edges[i].y1); +#if 0 // for debugging + printf("*** 
words after removing duplicate text ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); } - printf("\n------------------------------------------------------------\n\n"); + printf("\n"); + fflush(stdout); #endif - // build the blocks - yxBlocks = NULL; - blk1 = blk2 = NULL; - while (xyStrings) { - - // build the block - str0 = xyStrings; - xyStrings = xyStrings->next; - str0->next = NULL; - blk = new TextBlock(); - blk->strings = str0; - blk->xMin = str0->xMin; - blk->xMax = str0->xMax; - blk->yMin = str0->yMin; - blk->yMax = str0->yMax; - while (xyStrings) { - str1 = NULL; - str2 = xyStrings; - fit1 = coalesceFit(str0, str2); - if (!rawOrder) { - // look for best-fitting string - space = str0->yMax - str0->yMin; - for (str3 = xyStrings, str4 = xyStrings->next; - str4 && str4->xMin - str0->xMax <= space; - str3 = str4, str4 = str4->next) { - fit2 = coalesceFit(str0, str4); - if (fit2 < fit1) { - str1 = str3; - str2 = str4; - fit1 = fit2; - } - } - } - if (fit1 > 1) { - // no fit - we're done with this block - break; - } - - // if we've hit a column edge we're done with this block - if (fit1 > 0.2) { - for (i = 0; i < edgesLength; ++i) { - if (str0->xMax < edges[i].x + 0.5 && edges[i].x - 0.5 < str2->xMin && - str0->yMin < edges[i].y1 && str0->yMax > edges[i].y0 && - str2->yMin < edges[i].y1 && str2->yMax > edges[i].y0) { - break; - } - } - if (i < edgesLength) { + //----- merge words + + word0 = words; + while (word0) { + sz = word0->fontSize; + + // look for adjacent text which is part of the same word, and + // merge it into this word + xLimit = word0->xMax + sz * word0->font->minSpaceWidth; + if (rawOrder) { + word1 = word0; + word2 = word0->next; + found = word2 && + word2->xMin < xLimit && + word2->font == word0->font && + 
fabs(word2->fontSize - sz) < 0.05 && + fabs(word2->yBase - word0->yBase) < 0.05; + } else { + found = gFalse; + for (word1 = word0, word2 = word0->next; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + if (word2->font == word0->font && + fabs(word2->fontSize - sz) < 0.05 && + fabs(word2->yBase - word0->yBase) < 0.05) { + found = gTrue; break; } } - - if (str1) { - str1->next = str2->next; - } else { - xyStrings = str2->next; - } - str0->next = str2; - str2->next = NULL; - if (str2->xMax > blk->xMax) { - blk->xMax = str2->xMax; - } - if (str2->yMin < blk->yMin) { - blk->yMin = str2->yMin; - } - if (str2->yMax > blk->yMax) { - blk->yMax = str2->yMax; - } - str0 = str2; - } - - // insert block on list - if (!rawOrder) { - // insert block on list in yx order - for (blk1 = NULL, blk2 = yxBlocks; - blk2 && !yxBefore(blk, blk2); - blk1 = blk2, blk2 = blk2->next) ; } - blk->next = blk2; - if (blk1) { - blk1->next = blk; - } else { - yxBlocks = blk; + if (found) { + word0->merge(word2); + word1->next = word2->next; + delete word2; + continue; } - blk1 = blk; + + word0 = word0->next; } - gfree(edges); +#if 0 // for debugging + printf("*** after merging words ***\n"); + for (word0 = words; word0; word0 = word0->next) { + printf("word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, word0->yBase); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + printf("\n"); + fflush(stdout); +#endif - // the strings are now owned by the lines/blocks tree - xyStrings = NULL; + //----- assemble words into lines - // build the block text uMap = globalParams->getTextEncoding(); isUnicode = uMap ? 
uMap->isUnicode() : gFalse; - for (blk = yxBlocks; blk; blk = blk->next) { - blk->len = 0; - for (str1 = blk->strings; str1; str1 = str1->next) { - blk->len += str1->len; - if (str1->next && str1->next->xMin - str1->xMax > - textOutSpace * (str1->yMax - str1->yMin)) { - str1->spaceAfter = gTrue; - ++blk->len; + + lineList = NULL; + line0 = NULL; + while (words) { + + // build a new line object + word0 = words; + words = words->next; + word0->next = NULL; + line1 = new TextLine(); + line1->words = word0; + line1->xMin = word0->xMin; + line1->xMax = word0->xMax; + line1->yMin = word0->yMin; + line1->yMax = word0->yMax; + line1->yBase = word0->yBase; + line1->font = word0->font; + line1->fontSize = word0->fontSize; + line1->len = word0->len; + minSpace = line1->fontSize * word0->font->minSpaceWidth; + maxSpace = line1->fontSize * word0->font->maxSpaceWidth; + + // find subsequent words in the line + while (words) { + xLimit = line1->xMax + maxSpace; + fit1 = fit2 = 0; + word3 = word4 = NULL; + if (rawOrder) { + if (words && + words->xMin < xLimit && + ((fit1 = lineFit(line1, word0, words)) >= 0)) { + word3 = NULL; + word4 = words; + } + } else { + for (word1 = NULL, word2 = words; + word2 && word2->xMin < xLimit; + word1 = word2, word2 = word2->next) { + fit2 = lineFit(line1, word0, word2); + if (fit2 >= 0 && (!word4 || + (word4 && fit2 < fit1))) { + fit1 = fit2; + word3 = word1; + word4 = word2; + } + } + } + if (word4) { + if (word3) { + word3->next = word4->next; + } else { + words = word4->next; + } + word0->next = word4; + word4->next = NULL; + if (word4->xMax > line1->xMax) { + line1->xMax = word4->xMax; + } + if (word4->yMin < line1->yMin) { + line1->yMin = word4->yMin; + } + if (word4->yMax > line1->yMax) { + line1->yMax = word4->yMax; + } + line1->len += word4->len; + if (fit1 > minSpace) { + word0->spaceAfter = gTrue; + ++line1->len; + } + word0 = word4; } else { - str1->spaceAfter = gFalse; + break; } } - blk->text = (Unicode *)gmalloc(blk->len * 
sizeof(Unicode)); - blk->xRight = (double *)gmalloc(blk->len * sizeof(double)); - blk->col = (int *)gmalloc(blk->len * sizeof(int)); + + // build the line text + line1->text = (Unicode *)gmalloc(line1->len * sizeof(Unicode)); + line1->xRight = (double *)gmalloc(line1->len * sizeof(double)); + line1->col = (int *)gmalloc(line1->len * sizeof(int)); i = 0; - for (str1 = blk->strings; str1; str1 = str1->next) { - for (j = 0; j < str1->len; ++j) { - blk->text[i] = str1->text[j]; - blk->xRight[i] = str1->xRight[j]; + for (word1 = line1->words; word1; word1 = word1->next) { + for (j = 0; j < word1->len; ++j) { + line1->text[i] = word1->text[j]; + line1->xRight[i] = word1->xRight[j]; ++i; } - if (str1->spaceAfter) { - blk->text[i] = (Unicode)0x0020; - blk->xRight[i] = str1->next->xMin; + if (word1->spaceAfter && word1->next) { + line1->text[i] = (Unicode)0x0020; + line1->xRight[i] = word1->next->xMin; ++i; } } - blk->convertedLen = 0; - for (j = 0; j < blk->len; ++j) { - blk->col[j] = blk->convertedLen; + line1->convertedLen = 0; + for (j = 0; j < line1->len; ++j) { + line1->col[j] = line1->convertedLen; if (isUnicode) { - ++blk->convertedLen; + ++line1->convertedLen; } else if (uMap) { - blk->convertedLen += uMap->mapUnicode(blk->text[j], buf, sizeof(buf)); + line1->convertedLen += + uMap->mapUnicode(line1->text[j], buf, sizeof(buf)); } } + + // check for hyphen at end of line + //~ need to check for other chars used as hyphens + if (line1->text[line1->len - 1] == (Unicode)'-') { + line1->hyphenated = gTrue; + } + + // insert line on list + if (line0) { + line0->next = line1; + } else { + lineList = line1; + } + line0 = line1; } + if (uMap) { uMap->decRefCnt(); } -#if 0 //~ for debugging - for (blk = yxBlocks; blk; blk = blk->next) { - printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n", - blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len); - TextString *str; - for (str = blk->strings; str; str = str->next) { - printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f'", - 
str->xMin, str->xMax, str->yMin, str->yMax, - (str->yMax - str->yMin)); - for (i = 0; i < str->len; ++i) { - fputc(str->text[i] & 0xff, stdout); - } - if (str->spaceAfter) { - fputc(' ', stdout); +#if 0 // for debugging + printf("*** lines in xy order ***\n"); + for (line0 = lineList; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); } printf("'\n"); } } - printf("\n------------------------------------------------------------\n\n"); + printf("\n"); + fflush(stdout); #endif - // build the lines - lines = NULL; - line0 = NULL; - while (yxBlocks) { - blk0 = yxBlocks; - yxBlocks = yxBlocks->next; - blk0->next = NULL; - line = new TextLine(); - line->blocks = blk0; - line->yMin = blk0->yMin; - line->yMax = blk0->yMax; - while (yxBlocks) { + //----- column assignment - // remove duplicated text (fake boldface, shadowed text) - h = blk0->yMax - blk0->yMin; - if (yxBlocks->len == blk0->len && - !memcmp(yxBlocks->text, blk0->text, - yxBlocks->len * sizeof(Unicode)) && - fabs(yxBlocks->yMin - blk0->yMin) / h < 0.2 && - fabs(yxBlocks->yMax - blk0->yMax) / h < 0.2 && - fabs(yxBlocks->xMin - blk0->xMin) / h < 0.2 && - fabs(yxBlocks->xMax - blk0->xMax) / h < 0.2) { - blk1 = yxBlocks; - yxBlocks = yxBlocks->next; - delete blk1; - continue; + for (line1 = lineList; line1; line1 = line1->next) { + col1 = 0; + for (line2 = lineList; line2 != line1; line2 = line2->next) { + if (line1->xMin >= line2->xMax) { + d = (int)((line1->xMin - line2->xMax) / + (line1->font->maxSpaceWidth * line1->fontSize)); + if (d > 4) { + d = 4; + } + col2 = 
line2->col[0] + line2->convertedLen + d; + if (col2 > col1) { + col1 = col2; + } + } else if (line1->xMin > line2->xMin) { + for (i = 0; i < line2->len && line1->xMin >= line2->xRight[i]; ++i) ; + col2 = line2->col[i]; + if (col2 > col1) { + col1 = col2; + } } + } + for (j = 0; j < line1->len; ++j) { + line1->col[j] += col1; + } + } - if (rawOrder && yxBlocks->yMax < blk0->yMin) { - break; + //----- assemble lines into blocks + + if (rawOrder) { + + lines = lineList; + for (line1 = lines; line1; line1 = line1->next) { + line1->xSpaceL = 0; + line1->xSpaceR = pageWidth; + } + + } else { + + // sort lines into yx order + lines = NULL; + while (lineList) { + line0 = lineList; + lineList = lineList->next; + for (line1 = NULL, line2 = lines; + line2 && !line0->yxBefore(line2); + line1 = line2, line2 = line2->next) ; + if (line1) { + line1->next = line0; + } else { + lines = line0; } - if (yxBlocks->yMin > 0.2*blk0->yMin + 0.8*blk0->yMax || - yxBlocks->xMin < blk0->xMax) { - break; + line0->next = line2; + } + + // compute whitespace to left and right of each line + line0 = lines; + for (line1 = lines; line1; line1 = line1->next) { + + // find the first vertically overlapping line + for (; line0 && line0->yMax < line1->yMin; line0 = line0->next) ; + + // check each vertically overlapping line -- look for the nearest + // on each side + line1->xSpaceL = 0; + line1->xSpaceR = pageWidth; + for (line2 = line0; + line2 && line2->yMin < line1->yMax; + line2 = line2->next) { + if (line2->yMax > line1->yMin) { + if (line2->xMax < line1->xMin) { + if (line2->xMax > line1->xSpaceL) { + line1->xSpaceL = line2->xMax; + } + } else if (line2->xMin > line1->xMax) { + if (line2->xMin < line1->xSpaceR) { + line1->xSpaceR = line2->xMin; + } + } + } } - blk1 = yxBlocks; - yxBlocks = yxBlocks->next; - blk0->next = blk1; - blk1->next = NULL; - if (blk1->yMin < line->yMin) { - line->yMin = blk1->yMin; + } + } // (!rawOrder) + +#if 0 // for debugging + printf("*** lines in yx order ***\n"); + 
for (line0 = lines; line0; line0 = line0->next) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSz=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->fontSize, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); } - if (blk1->yMax > line->yMax) { - line->yMax = blk1->yMax; + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); +#endif + + lineList = lines; + yxBlocks = NULL; + blk0 = NULL; + while (lineList) { + + // build a new block object + line0 = lineList; + lineList = lineList->next; + line0->next = NULL; + blk1 = new TextBlock(); + blk1->lines = line0; + blk1->xMin = line0->xMin; + blk1->xMax = line0->xMax; + blk1->yMin = line0->yMin; + blk1->yMax = line0->yMax; + blk1->xSpaceL = line0->xSpaceL; + blk1->xSpaceR = line0->xSpaceR; + blk1->maxFontSize = line0->fontSize; + + // find subsequent lines in the block + while (lineList) { + + // look for the first horizontally overlapping line below this + // one + yLimit = line0->yMax + blkMaxSpacing * line0->fontSize; + line3 = line4 = NULL; + if (rawOrder) { + if (lineList->yMin < yLimit && + lineList->xMax > blk1->xMin && + lineList->xMin < blk1->xMax) { + line3 = NULL; + line4 = lineList; + } + } else { + for (line1 = NULL, line2 = lineList; + line2 && line2->yMin < yLimit; + line1 = line2, line2 = line2->next) { + if (line2->xMax > blk1->xMin && + line2->xMin < blk1->xMax) { + line3 = line1; + line4 = line2; + break; + } + } + } + + // if there is an overlapping line and it fits in the block, add + // it to the block + if (line4 && blockFit(blk1, line4)) { + if (line3) { + line3->next = line4->next; + } else { + lineList = line4->next; + } + 
line0->next = line0->flowNext = line4; + line4->next = NULL; + if (line4->xMin < blk1->xMin) { + blk1->xMin = line4->xMin; + } else if (line4->xMax > blk1->xMax) { + blk1->xMax = line4->xMax; + } + if (line4->yMax > blk1->yMax) { + blk1->yMax = line4->yMax; + } + if (line4->xSpaceL > blk1->xSpaceL) { + blk1->xSpaceL = line4->xSpaceL; + } + if (line4->xSpaceR < blk1->xSpaceR) { + blk1->xSpaceR = line4->xSpaceR; + } + if (line4->fontSize > blk1->maxFontSize) { + blk1->maxFontSize = line4->fontSize; + } + line0 = line4; + + // otherwise, we're done with this block + } else { + break; } + } + + // insert block on list, in yx order + if (rawOrder) { + blk2 = blk0; + blk3 = NULL; blk0 = blk1; + } else { + for (blk2 = NULL, blk3 = yxBlocks; + blk3 && !blk1->yxBefore(blk3); + blk2 = blk3, blk3 = blk3->next) ; } - if (line0) { - line0->next = line; + blk1->next = blk3; + if (blk2) { + blk2->next = blk1; } else { - lines = line; + yxBlocks = blk1; } - line->next = NULL; - line0 = line; } - - // sort the blocks into xy order - xyBlocks = NULL; - for (line = lines; line; line = line->next) { - for (blk = line->blocks; blk; blk = blk->next) { - for (blk1 = NULL, blk2 = xyBlocks; - blk2 && !xyBefore(blk, blk2); - blk1 = blk2, blk2 = blk2->xyNext) ; - blk->xyNext = blk2; - if (blk1) { - blk1->xyNext = blk; - } else { - xyBlocks = blk; +#if 0 // for debugging + printf("*** blocks in yx order ***\n"); + for (blk0 = yxBlocks; blk0; blk0 = blk0->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < 
word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); } } } + printf("\n"); + fflush(stdout); +#endif + + //----- merge lines and blocks, sort blocks into reading order + + if (rawOrder) { + blocks = yxBlocks; + + } else { + blocks = NULL; + blk0 = NULL; + blkStack = NULL; + while (yxBlocks) { -#if 0 //~ for debugging - for (blk = xyBlocks; blk; blk = blk->xyNext) { - printf("[block: x=%.2f..%.2f y=%.2f..%.2f len=%d]\n", - blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->len); - TextString *str; - for (str = blk->strings; str; str = str->next) { - printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", - str->xMin, str->xMax, str->yMin, str->yMax, - (str->yMax - str->yMin)); - for (i = 0; i < str->len; ++i) { - fputc(str->text[i] & 0xff, stdout); + // find the next two blocks: + // - if the depth-first traversal stack is empty, take the first + // (upper-left-most) two blocks on the yx-sorted block list + // - otherwise, find the two upper-left-most blocks under the top + // block on the stack + if (blkStack) { + blk3 = blk4 = blk5 = blk6 = NULL; + for (blk1 = NULL, blk2 = yxBlocks; + blk2; + blk1 = blk2, blk2 = blk2->next) { + if (blk2->yMin > blkStack->yMin && + blk2->xMax > blkStack->xMin && + blk2->xMin < blkStack->xMax) { + if (!blk4 || blk2->yxBefore(blk4)) { + blk5 = blk3; + blk6 = blk4; + blk3 = blk1; + blk4 = blk2; + } else if (!blk6 || blk2->yxBefore(blk6)) { + blk5 = blk1; + blk6 = blk2; + } + } + } + } else { + blk3 = NULL; + blk4 = yxBlocks; + blk5 = yxBlocks; + blk6 = yxBlocks->next; + } + + // merge case 1: + // | | | + // | blkStack | | blkStack + // +---------------------+ --> +-------------- + // +------+ +------+ +-----------+ + // | blk4 | | blk6 | ... 
| blk4+blk6 | + // +------+ +------+ +-----------+ + if (blkStack) { + yLimit = blkStack->yMax + blkMaxSpacing * blkStack->lines->fontSize; + } + if (blkStack && blk4 && blk6 && + !blk4->lines->next && !blk6->lines->next && + lineFit2(blk4->lines, blk6->lines) && + blk4->yMin < yLimit && + blk4->xMin > blkStack->xSpaceL && + blkStack->xMin > blk4->xSpaceL && + blk6->xMax < blkStack->xSpaceR) { + blk4->mergeRight(blk6); + if (blk5) { + blk5->next = blk6->next; + } else { + yxBlocks = blk6->next; + } + delete blk6; + + // merge case 2: + // | | | | + // | blkStack | | | + // +---------------------+ --> | blkStack+blk2 | + // +---------------------+ | | + // | blk4 | | | + // | | | | + } else if (blkStack && blk4 && + blk4->yMin < yLimit && + blockFit2(blkStack, blk4)) { + blkStack->mergeBelow(blk4); + if (blk3) { + blk3->next = blk4->next; + } else { + yxBlocks = blk4->next; + } + delete blk4; + + // if any of: + // 1. no block found + // 2. non-fully overlapping block found + // 3. large vertical gap above the overlapping block + // then pop the stack and try again + } else if (!blk4 || + (blkStack && (blk4->xMin < blkStack->xSpaceL || + blk4->xMax > blkStack->xSpaceR || + blk4->yMin - blkStack->yMax > + blkMaxSortSpacing * blkStack->maxFontSize))) { + blkStack = blkStack->stackNext; + + // add a block to the sorted list + } else { + + // remove the block from the yx-sorted list + if (blk3) { + blk3->next = blk4->next; + } else { + yxBlocks = blk4->next; + } + blk4->next = NULL; + + // append the block to the reading-order list + if (blk0) { + blk0->next = blk4; + } else { + blocks = blk4; + } + blk0 = blk4; + + // push the block on the traversal stack + blk4->stackNext = blkStack; + blkStack = blk4; + } + } + } // (!rawOrder) + +#if 0 // for debugging + printf("*** blocks in reading order (after merging) ***\n"); + for (blk0 = blocks; blk0; blk0 = blk0->next) { + printf("[block: x=%.2f..%.2f y=%.2f..%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax); + for 
(line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); } - printf("'\n"); } } - printf("\n------------------------------------------------------------\n\n"); + printf("\n"); + fflush(stdout); #endif - // do column assignment - for (blk1 = xyBlocks; blk1; blk1 = blk1->xyNext) { - col1 = 0; - for (blk2 = xyBlocks; blk2 != blk1; blk2 = blk2->xyNext) { - if (blk1->xMin >= blk2->xMax) { - d = (int)((blk1->xMin - blk2->xMax) / - (0.4 * (blk1->yMax - blk1->yMin))); - if (d > 4) { - d = 4; + //----- assemble blocks into flows + + if (rawOrder) { + + // one flow per block + flow0 = NULL; + while (blocks) { + flow1 = new TextFlow(); + flow1->blocks = blocks; + flow1->lines = blocks->lines; + flow1->yMin = blocks->yMin; + flow1->yMax = blocks->yMax; + blocks = blocks->next; + flow1->blocks->next = NULL; + if (flow0) { + flow0->next = flow1; + } else { + flows = flow1; + } + flow0 = flow1; + } + + } else { + + // compute whitespace above and below each block + for (blk0 = blocks; blk0; blk0 = blk0->next) { + blk0->ySpaceT = 0; + blk0->ySpaceB = pageHeight; + + // check each horizontally overlapping block + for (blk1 = blocks; blk1; blk1 = blk1->next) { + if (blk1 != blk0 && + blk1->xMin < blk0->xMax && + blk1->xMax > blk0->xMin) { + if (blk1->yMax < blk0->yMin) { + if (blk1->yMax > blk0->ySpaceT) { + blk0->ySpaceT = blk1->yMax; + } + } else if (blk1->yMin > blk0->yMax) { + if (blk1->yMin < blk0->ySpaceB) { + blk0->ySpaceB = blk1->yMin; + } + } } - col2 = blk2->col[0] + blk2->convertedLen + d; - if (col2 > col1) { - col1 = 
col2; + } + } + + flow0 = NULL; + while (blocks) { + + // build a new flow object + flow1 = new TextFlow(); + flow1->blocks = blocks; + flow1->lines = blocks->lines; + flow1->yMin = blocks->yMin; + flow1->yMax = blocks->yMax; + flow1->ySpaceT = blocks->ySpaceT; + flow1->ySpaceB = blocks->ySpaceB; + + // find subsequent blocks in the flow + for (blk1 = blocks, blk2 = blocks->next; + blk2 && flowFit(flow1, blk2); + blk1 = blk2, blk2 = blk2->next) { + if (blk2->yMin < flow1->yMin) { + flow1->yMin = blk2->yMin; } - } else if (blk1->xMin > blk2->xMin) { - for (i = 0; i < blk2->len && blk1->xMin >= blk2->xRight[i]; ++i) ; - col2 = blk2->col[i]; - if (col2 > col1) { - col1 = col2; + if (blk2->yMax > flow1->yMax) { + flow1->yMax = blk2->yMax; + } + if (blk2->ySpaceT > flow1->ySpaceT) { + flow1->ySpaceT = blk2->ySpaceT; } + if (blk2->ySpaceB < flow1->ySpaceB) { + flow1->ySpaceB = blk2->ySpaceB; + } + for (line1 = blk1->lines; line1->next; line1 = line1->next) ; + line1->flowNext = blk2->lines; + } + + // chop the block list + blocks = blk1->next; + blk1->next = NULL; + + // append the flow to the list + if (flow0) { + flow0->next = flow1; + } else { + flows = flow1; } + flow0 = flow1; } - for (j = 0; j < blk1->len; ++j) { - blk1->col[j] += col1; + } + +#if 0 // for debugging + printf("*** flows ***\n"); + for (flow0 = flows; flow0; flow0 = flow0->next) { + printf("[flow]\n"); + for (blk0 = flow0->blocks; blk0; blk0 = blk0->next) { + printf(" [block: x=%.2f..%.2f y=%.2f..%.2f ySpaceT=%.2f ySpaceB=%.2f]\n", + blk0->xMin, blk0->xMax, blk0->yMin, blk0->yMax, + blk0->ySpaceT, blk0->ySpaceB); + for (line0 = blk0->lines; line0; line0 = line0->next) { + printf(" [line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + 
word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } } } + printf("\n"); + fflush(stdout); +#endif + + //----- sort lines into yx order -#if 0 //~ for debugging - for (line = lines; line; line = line->next) { - printf("[line]\n"); - for (blk = line->blocks; blk; blk = blk->next) { - printf("[block: col=%d, len=%d]\n", blk->col[0], blk->len); - TextString *str; - for (str = blk->strings; str; str = str->next) { - printf(" x=%.2f..%.2f y=%.2f..%.2f size=%.2f '", - str->xMin, str->xMax, str->yMin, str->yMax, - (str->yMax - str->yMin)); - for (i = 0; i < str->len; ++i) { - fputc(str->text[i] & 0xff, stdout); + // (the block/line merging process doesn't maintain the full-page + // linked list of lines) + + lines = NULL; + if (rawOrder) { + line0 = NULL; + for (flow0 = flows; flow0; flow0 = flow0->next) { + for (line1 = flow0->lines; line1; line1 = line1->flowNext) { + if (line0) { + line0->pageNext = line1; + } else { + lines = line1; } - if (str->spaceAfter) { - printf(" [space]\n"); + line0 = line1; + } + } + } else { + for (flow0 = flows; flow0; flow0 = flow0->next) { + for (line0 = flow0->lines; line0; line0 = line0->flowNext) { + for (line1 = NULL, line2 = lines; + line2 && !line0->yxBefore(line2); + line1 = line2, line2 = line2->pageNext) ; + if (line1) { + line1->pageNext = line0; + } else { + lines = line0; } - printf("'\n"); + line0->pageNext = line2; } } } - printf("\n------------------------------------------------------------\n\n"); + +#if 0 // for debugging + printf("*** lines in yx order ***\n"); + for (line0 = lines; line0; line0 = line0->pageNext) { + printf("[line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f xSpaceL=%.2f xSpaceR=%.2f col=%d len=%d]\n", + line0->xMin, line0->xMax, line0->yMin, line0->yMax, + line0->yBase, line0->xSpaceL, line0->xSpaceR, line0->col[0], + line0->len); + for (word0 = line0->words; word0; word0 = word0->next) { + printf(" word: x=%.2f..%.2f 
y=%.2f..%.2f base=%.2f space=%d: '", + word0->xMin, word0->xMax, word0->yMin, word0->yMax, + word0->yBase, word0->spaceAfter); + for (i = 0; i < word0->len; ++i) { + fputc(word0->text[i] & 0xff, stdout); + } + printf("'\n"); + } + } + printf("\n"); + fflush(stdout); #endif } +// Returns a non-negative number if <word> can be added to <line> +// (whose last word is <lastWord>). A smaller return value indicates +// a better fit. If <word> cannot be added to <line> at all, returns +// a negative number. +double TextPage::lineFit(TextLine *line, TextWord *lastWord, TextWord *word) { + double fontSize0, fontSize1; + double dx, dxLimit; + + fontSize0 = line->fontSize; + fontSize1 = word->fontSize; + dx = word->xMin - lastWord->xMax; + dxLimit = fontSize0 * line->font->maxSpaceWidth; + + // check inter-word spacing + if (dx < fontSize0 * lineMinDeltaX || + dx > dxLimit) { + return -1; + } + + // ensure a non-negative return value + if (dx < 0) { + dx = 0; + } + + // look for adjacent words with close baselines and close font sizes + if (fabs(line->yBase - word->yBase) < lineMaxBaselineDelta * fontSize0 && + fontSize0 < lineMaxFontSizeRatio * fontSize1 && + fontSize1 < lineMaxFontSizeRatio * fontSize0) { + return dx; + } + + // look for a superscript + if (fontSize1 > lineMinSuperscriptFontSizeRatio * fontSize0 && + fontSize1 < lineMaxSuperscriptFontSizeRatio * fontSize0 && + (word->yMax < lastWord->yMax || + word->yBase < lastWord->yBase) && + word->yMax - lastWord->yMin > lineMinSuperscriptOverlap * fontSize0 && + dx < fontSize0 * lineMaxSuperscriptDeltaX) { + return dx; + } + + // look for a subscript + if (fontSize1 > lineMinSubscriptFontSizeRatio * fontSize0 && + fontSize1 < lineMaxSubscriptFontSizeRatio * fontSize0 && + (word->yMin > lastWord->yMin || + word->yBase > lastWord->yBase) && + line->yMax - word->yMin > lineMinSubscriptOverlap * fontSize0 && + dx < fontSize0 * lineMaxSubscriptDeltaX) { + return dx; + } + + return -1; +} + +// Returns true if <line0> and <line1> can be merged into a single +// line, ignoring
max word spacing. +GBool TextPage::lineFit2(TextLine *line0, TextLine *line1) { + double fontSize0, fontSize1; + double dx; + + fontSize0 = line0->fontSize; + fontSize1 = line1->fontSize; + dx = line1->xMin - line0->xMax; + + // check inter-word spacing + if (dx < fontSize0 * lineMinDeltaX) { + return gFalse; + } + + // look for close baselines and close font sizes + if (fabs(line0->yBase - line1->yBase) < lineMaxBaselineDelta * fontSize0 && + fontSize0 < lineMaxFontSizeRatio * fontSize1 && + fontSize1 < lineMaxFontSizeRatio * fontSize0) { + return gTrue; + } + + return gFalse; +} + +// Returns true if <line> can be added to <blk>. Assumes the y +// coordinates are within range. +GBool TextPage::blockFit(TextBlock *blk, TextLine *line) { + double fontSize0, fontSize1; + + // check edges + if (line->xMin < blk->xSpaceL || + line->xMax > blk->xSpaceR || + blk->xMin < line->xSpaceL || + blk->xMax > line->xSpaceR) { + return gFalse; + } + + // check font sizes + fontSize0 = blk->lines->fontSize; + fontSize1 = line->fontSize; + if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || + fontSize1 > blkMaxFontSizeRatio * fontSize0) { + return gFalse; + } + + return gTrue; +} + +// Returns true if <blk0> and <blk1> can be merged into a single +// block. Assumes the y coordinates are within range. +GBool TextPage::blockFit2(TextBlock *blk0, TextBlock *blk1) { + double fontSize0, fontSize1; + + // check edges + if (blk1->xMin < blk0->xSpaceL || + blk1->xMax > blk0->xSpaceR || + blk0->xMin < blk1->xSpaceL || + blk0->xMax > blk1->xSpaceR) { + return gFalse; + } + + // check font sizes + fontSize0 = blk0->lines->fontSize; + fontSize1 = blk1->lines->fontSize; + if (fontSize0 > blkMaxFontSizeRatio * fontSize1 || + fontSize1 > blkMaxFontSizeRatio * fontSize0) { + return gFalse; + } + + return gTrue; +} + +// Returns true if <blk> can be added to <flow>.
+GBool TextPage::flowFit(TextFlow *flow, TextBlock *blk) { + double dy; + + // check whitespace above and below + if (blk->yMin < flow->ySpaceT || + blk->yMax > flow->ySpaceB || + flow->yMin < blk->ySpaceT || + flow->yMax > blk->ySpaceB) { + return gFalse; + } + + // check that block top edge is within +/- dy of flow top edge, + // and that block bottom edge is above flow bottom edge + dy + dy = flowMaxDeltaY * flow->blocks->maxFontSize; + return blk->yMin > flow->yMin - dy && + blk->yMin < flow->yMin + dy && + blk->yMax < flow->yMax + dy; +} + GBool TextPage::findText(Unicode *s, int len, GBool top, GBool bottom, double *xMin, double *yMin, double *xMax, double *yMax) { TextLine *line; - TextBlock *blk; Unicode *p; Unicode u1, u2; int m, i, j; double x0, x1, x; - // scan all blocks on page - for (line = lines; line; line = line->next) { - for (blk = line->blocks; blk; blk = blk->next) { + // scan all text on the page + for (line = lines; line; line = line->pageNext) { - // check: above top limit? - if (!top && (blk->yMax < *yMin || - (blk->yMin < *yMin && blk->xMax <= *xMin))) { - continue; - } + // check: above top limit? + if (!top && (line->yMax < *yMin || + (line->yMin < *yMin && line->xMax <= *xMin))) { + continue; + } - // check: below bottom limit? - if (!bottom && (blk->yMin > *yMax || - (blk->yMax > *yMax && blk->xMin >= *xMax))) { - return gFalse; - } + // check: below bottom limit? + if (!bottom && (line->yMin > *yMax || + (line->yMax > *yMax && line->xMin >= *xMax))) { + return gFalse; + } - // search each position in this block - m = blk->len; - for (i = 0, p = blk->text; i <= m - len; ++i, ++p) { + // search each position in this line + m = line->len; + for (i = 0, p = line->text; i <= m - len; ++i, ++p) { - x0 = (i == 0) ? blk->xMin : blk->xRight[i-1]; - x1 = blk->xRight[i]; - x = 0.5 * (x0 + x1); + x0 = (i == 0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + x = 0.5 * (x0 + x1); - // check: above top limit? 
- if (!top && blk->yMin < *yMin) { - if (x < *xMin) { - continue; - } + // check: above top limit? + if (!top && line->yMin < *yMin) { + if (x < *xMin) { + continue; } + } - // check: below bottom limit? - if (!bottom && blk->yMax > *yMax) { - if (x > *xMax) { - return gFalse; - } + // check: below bottom limit? + if (!bottom && line->yMax > *yMax) { + if (x > *xMax) { + return gFalse; } + } - // compare the strings - for (j = 0; j < len; ++j) { + // compare the strings + for (j = 0; j < len; ++j) { #if 1 //~ this lowercases Latin A-Z only -- this will eventually be - //~ extended to handle other character sets - if (p[j] >= 0x41 && p[j] <= 0x5a) { - u1 = p[j] + 0x20; - } else { - u1 = p[j]; - } - if (s[j] >= 0x41 && s[j] <= 0x5a) { - u2 = s[j] + 0x20; - } else { - u2 = s[j]; - } + //~ extended to handle other character sets + if (p[j] >= 0x41 && p[j] <= 0x5a) { + u1 = p[j] + 0x20; + } else { + u1 = p[j]; + } + if (s[j] >= 0x41 && s[j] <= 0x5a) { + u2 = s[j] + 0x20; + } else { + u2 = s[j]; + } #endif - if (u1 != u2) { - break; - } + if (u1 != u2) { + break; } + } - // found it - if (j == len) { - *xMin = x0; - *xMax = blk->xRight[i + len - 1]; - *yMin = blk->yMin; - *yMax = blk->yMax; - return gTrue; - } + // found it + if (j == len) { + *xMin = x0; + *xMax = line->xRight[i + len - 1]; + *yMin = line->yMin; + *yMax = line->yMax; + return gTrue; } } } @@ -870,8 +1692,7 @@ GString *TextPage::getText(double xMin, double yMin, GBool isUnicode; char space[8], eol[16], buf[8]; int spaceLen, eolLen, len; - TextLine *line; - TextBlock *blk; + TextLine *line, *prevLine; double x0, x1, y; int firstCol, col, i; GBool multiLine; @@ -899,40 +1720,32 @@ GString *TextPage::getText(double xMin, double yMin, } // find the leftmost column - multiLine = gFalse; firstCol = -1; - for (line = lines; line; line = line->next) { + for (line = lines; line; line = line->pageNext) { if (line->yMin > yMax) { break; } - if (line->yMax < yMin) { - continue; - } - - for (blk = line->blocks; blk 
&& blk->xMax < xMin; blk = blk->next) ; - if (!blk || blk->xMin > xMax) { + if (line->yMax < yMin || + line->xMax < xMin || + line->xMin > xMax) { continue; } - y = 0.5 * (blk->yMin + blk->yMax); + y = 0.5 * (line->yMin + line->yMax); if (y < yMin || y > yMax) { continue; } - if (firstCol >= 0) { - multiLine = gTrue; - } - i = 0; while (1) { - x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; - x1 = blk->xRight[i]; + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; if (0.5 * (x0 + x1) > xMin) { break; } ++i; } - col = blk->col[i]; + col = line->col[i]; if (firstCol < 0 || col < firstCol) { firstCol = col; @@ -940,83 +1753,82 @@ GString *TextPage::getText(double xMin, double yMin, } // extract the text - for (line = lines; line; line = line->next) { + col = firstCol; + multiLine = gFalse; + for (prevLine = NULL, line = lines; + line; + prevLine = line, line = line->pageNext) { if (line->yMin > yMax) { break; } - if (line->yMax < yMin) { - continue; - } - - for (blk = line->blocks; blk && blk->xMax < xMin; blk = blk->next) ; - if (!blk || blk->xMin > xMax) { + if (line->yMax < yMin || + line->xMax < xMin || + line->xMin > xMax) { continue; } - y = 0.5 * (blk->yMin + blk->yMax); + y = 0.5 * (line->yMin + line->yMax); if (y < yMin || y > yMax) { continue; } i = 0; while (1) { - x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; - x1 = blk->xRight[i]; + x0 = (i==0) ? 
line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; if (0.5 * (x0 + x1) > xMin) { break; } ++i; } - col = firstCol; - - do { - - // line this block up with the correct column - for (; col < blk->col[i]; ++col) { - s->append(space, spaceLen); - } + // insert a return + if (col > line->col[i] || + (prevLine && + line->yMin > + prevLine->yMax - lineOverlapSlack * prevLine->fontSize)) { + s->append(eol, eolLen); + col = firstCol; + multiLine = gTrue; + } - // print the block - for (; i < blk->len; ++i) { + // line this block up with the correct column + for (; col < line->col[i]; ++col) { + s->append(space, spaceLen); + } - x0 = (i==0) ? blk->xMin : blk->xRight[i-1]; - x1 = blk->xRight[i]; - if (0.5 * (x0 + x1) > xMax) { - break; - } + // print the portion of the line + for (; i < line->len; ++i) { - len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf)); - s->append(buf, len); - col += isUnicode ? 1 : len; - } - if (i < blk->len) { + x0 = (i==0) ? line->xMin : line->xRight[i-1]; + x1 = line->xRight[i]; + if (0.5 * (x0 + x1) > xMax) { break; } - // next block - blk = blk->next; - i = 0; - - } while (blk && blk->xMin < xMax); - - if (multiLine) { - s->append(eol, eolLen); + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + s->append(buf, len); + col += isUnicode ? 
1 : len; } } + if (multiLine) { + s->append(eol, eolLen); + } + uMap->decRefCnt(); return s; } -void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { +void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout) { UnicodeMap *uMap; char space[8], eol[16], eop[8], buf[8]; int spaceLen, eolLen, eopLen, len; + TextFlow *flow; TextLine *line; - TextBlock *blk; - int col, d, i; + int col, d, n, i; // get the output encoding if (!(uMap = globalParams->getTextEncoding())) { @@ -1038,142 +1850,121 @@ void TextPage::dump(void *outputStream, TextOutputFunc outputFunc) { } eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop)); - // output - for (line = lines; line; line = line->next) { + // output the page, maintaining the original physical layout + if (physLayout || rawOrder) { col = 0; - for (blk = line->blocks; blk; blk = blk->next) { + for (line = lines; line; line = line->pageNext) { // line this block up with the correct column - if (rawOrder && col == 0) { - col = blk->col[0]; - } else { - for (; col < blk->col[0]; ++col) { + if (!rawOrder) { + for (; col < line->col[0]; ++col) { (*outputFunc)(outputStream, space, spaceLen); } } - // print the block - for (i = 0; i < blk->len; ++i) { - len = uMap->mapUnicode(blk->text[i], buf, sizeof(buf)); + // print the line + for (i = 0; i < line->len; ++i) { + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); (*outputFunc)(outputStream, buf, len); } - col += blk->convertedLen; - } + col += line->convertedLen; + + // print one or more returns if necessary + if (!line->pageNext || + line->pageNext->col[0] < col || + line->pageNext->yMin > + line->yMax - lineOverlapSlack * line->fontSize) { + + // compute number of returns + d = 1; + if (line->pageNext) { + d += (int)((line->pageNext->yMin - line->yMax) / + line->fontSize + 0.5); + } + + // various things (weird font matrices) can result in bogus + // values here, so do a sanity check + if (d < 1) { + d = 1; + } else if (d > 5) { + d = 5; + 
} + for (; d > 0; --d) { + (*outputFunc)(outputStream, eol, eolLen); + } - // print a return - (*outputFunc)(outputStream, eol, eolLen); - - // print extra vertical space if necessary - if (line->next) { - d = (int)((line->next->yMin - line->yMax) / - (line->blocks->strings->yMax - lines->blocks->strings->yMin) - + 0.5); - // various things (weird font matrices) can result in bogus - // values here, so do a sanity check - if (rawOrder && d > 2) { - d = 2; - } else if (!rawOrder && d > 5) { - d = 5; + col = 0; } - for (; d > 0; --d) { - (*outputFunc)(outputStream, eol, eolLen); + } + + // output the page, "undoing" the layout + } else { + for (flow = flows; flow; flow = flow->next) { + for (line = flow->lines; line; line = line->flowNext) { + n = line->len; + if (line->flowNext && line->hyphenated) { + --n; + } + for (i = 0; i < n; ++i) { + len = uMap->mapUnicode(line->text[i], buf, sizeof(buf)); + (*outputFunc)(outputStream, buf, len); + } + if (line->flowNext && !line->hyphenated) { + (*outputFunc)(outputStream, space, spaceLen); + } } + (*outputFunc)(outputStream, eol, eolLen); + (*outputFunc)(outputStream, eol, eolLen); } } // end of page - (*outputFunc)(outputStream, eol, eolLen); (*outputFunc)(outputStream, eop, eopLen); (*outputFunc)(outputStream, eol, eolLen); uMap->decRefCnt(); } -// Returns true if <str1> should be inserted before <str2> in xy -// order. -GBool TextPage::xyBefore(TextString *str1, TextString *str2) { - return str1->xMin < str2->xMin || - (str1->xMin == str2->xMin && str1->yMin < str2->yMin); -} - -// Returns true if <blk1> should be inserted before <blk2> in xy -// order. -GBool TextPage::xyBefore(TextBlock *blk1, TextBlock *blk2) { - return blk1->xMin < blk2->xMin || - (blk1->xMin == blk2->xMin && blk1->yMin < blk2->yMin); -} - -// Returns true if <blk1> should be inserted before <blk2> in yx -// order, allowing a little slack for vertically overlapping text.
-GBool TextPage::yxBefore(TextBlock *blk1, TextBlock *blk2) { - double h1, h2, overlap; - - h1 = blk1->yMax - blk1->yMin; - h2 = blk2->yMax - blk2->yMin; - overlap = ((blk1->yMax < blk2->yMax ? blk1->yMax : blk2->yMax) - - (blk1->yMin > blk2->yMin ? blk1->yMin : blk2->yMin)) / - (h1 < h2 ? h1 : h2); - if (overlap > 0.6) { - return blk1->xMin < blk2->xMin; - } - return blk1->yMin < blk2->yMin; -} - -double TextPage::coalesceFit(TextString *str1, TextString *str2) { - double h1, h2, w1, w2, r, overlap, spacing; - - h1 = str1->yMax - str1->yMin; - h2 = str2->yMax - str2->yMin; - w1 = str1->xMax - str1->xMin; - w2 = str2->xMax - str2->xMin; - r = h1 / h2; - if (r < (1.0 / 3.0) || r > 3) { - return 10; - } - overlap = ((str1->yMax < str2->yMax ? str1->yMax : str2->yMax) - - (str1->yMin > str2->yMin ? str1->yMin : str2->yMin)) / - (h1 < h2 ? h1 : h2); - if (overlap < 0.5) { - return 10; - } - spacing = (str2->xMin - str1->xMax) / (h1 > h2 ? h1 : h2); - if (spacing < -0.5) { - return 10; - } - // separate text that overlaps - duplicated text (so that fake - // boldface and shadowed text can be cleanly removed) - if ((str2->xMin - str1->xMax) / (w1 < w2 ? 
w1 : w2) < -0.7) { - return 10; - } - return spacing; +void TextPage::startPage(GfxState *state) { + clear(); + pageWidth = state->getPageWidth(); + pageHeight = state->getPageHeight(); } void TextPage::clear() { - TextLine *p1, *p2; - TextString *s1, *s2; + TextWord *w1, *w2; + TextFlow *f1, *f2; - if (curStr) { - delete curStr; - curStr = NULL; + if (curWord) { + delete curWord; + curWord = NULL; } - if (lines) { - for (p1 = lines; p1; p1 = p2) { - p2 = p1->next; - delete p1; + if (words) { + for (w1 = words; w1; w1 = w2) { + w2 = w1->next; + delete w1; } - } else if (xyStrings) { - for (s1 = xyStrings; s1; s1 = s2) { - s2 = s1->next; - delete s1; + } else if (flows) { + for (f1 = flows; f1; f1 = f2) { + f2 = f1->next; + delete f1; } } - xyStrings = NULL; - xyCur1 = xyCur2 = NULL; - lines = NULL; + deleteGList(fonts, TextFontInfo); + + curWord = NULL; + font = NULL; + fontSize = 0; nest = 0; nTinyChars = 0; + words = wordPtr = NULL; + lines = NULL; + flows = NULL; + fonts = new GList(); + } + //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ @@ -1182,8 +1973,10 @@ static void outputToFile(void *stream, char *text, int len) { fwrite(text, 1, len, (FILE *)stream); } -TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) { +TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append) { text = NULL; + physLayout = physLayoutA; rawOrder = rawOrderA; ok = gTrue; @@ -1205,16 +1998,17 @@ TextOutputDev::TextOutputDev(char *fileName, GBool rawOrderA, GBool append) { } // set up text object - text = new TextPage(rawOrder); + text = new TextPage(rawOrderA); } TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, - GBool rawOrderA) { + GBool physLayoutA, GBool rawOrderA) { outputFunc = func; outputStream = stream; needClose = gFalse; + physLayout = physLayoutA; rawOrder = rawOrderA; - text = 
new TextPage(rawOrder); + text = new TextPage(rawOrderA); ok = gTrue; } @@ -1231,13 +2025,13 @@ TextOutputDev::~TextOutputDev() { } void TextOutputDev::startPage(int pageNum, GfxState *state) { - text->clear(); + text->startPage(state); } void TextOutputDev::endPage() { text->coalesce(); if (outputStream) { - text->dump(outputStream, outputFunc); + text->dump(outputStream, outputFunc, physLayout); } } @@ -1246,18 +2040,18 @@ void TextOutputDev::updateFont(GfxState *state) { } void TextOutputDev::beginString(GfxState *state, GString *s) { - text->beginString(state, state->getCurX(), state->getCurY()); + text->beginWord(state, state->getCurX(), state->getCurY()); } void TextOutputDev::endString(GfxState *state) { - text->endString(); + text->endWord(); } void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, Unicode *u, int uLen) { - text->addChar(state, x, y, dx, dy, u, uLen); + text->addChar(state, x, y, dx, dy, c, u, uLen); } GBool TextOutputDev::findText(Unicode *s, int len, @@ -1272,3 +2066,5 @@ GString *TextOutputDev::getText(double xMin, double yMin, return text->getText(xMin, yMin, xMax, yMax); } + + diff --git a/pdf/xpdf/TextOutputDev.h b/pdf/xpdf/TextOutputDev.h index f681ecf..8e94f04 100644 --- a/pdf/xpdf/TextOutputDev.h +++ b/pdf/xpdf/TextOutputDev.h @@ -20,12 +20,10 @@ #include "GfxFont.h" #include "OutputDev.h" -class GfxState; class GString; -class TextBlock; -class TextLine; - -#undef TEXTOUT_DO_SYMBOLS +class GList; +class GfxFont; +class GfxState; //------------------------------------------------------------------------ @@ -33,43 +31,165 @@ typedef void (*TextOutputFunc)(void *stream, char *text, int len); //------------------------------------------------------------------------ -// TextString +// TextFontInfo //------------------------------------------------------------------------ -class TextString { +class TextFontInfo { +public: + + TextFontInfo(GfxState *state); + 
~TextFontInfo(); + + GBool matches(GfxState *state); + +private: + + GfxFont *gfxFont; + double horizScaling; + + double minSpaceWidth; // min width for inter-word space, as a + // fraction of the font size + double maxSpaceWidth; // max width for inter-word space, as a + // fraction of the font size + + + friend class TextWord; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextWord +//------------------------------------------------------------------------ + +class TextWord { public: // Constructor. - TextString(GfxState *state, double x0, double y0, - double fontSize); + TextWord(GfxState *state, double x0, double y0, + TextFontInfo *fontA, double fontSize); // Destructor. - ~TextString(); + ~TextWord(); - // Add a character to the string. + // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u); + private: + GBool xyBefore(TextWord *word2); + void merge(TextWord *word2); + double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates - union { - GBool marked; // temporary flag used by coalesce() - GBool spaceAfter; // insert a space after this string? 
- }; + double yBase; // baseline y coordinate Unicode *text; // the text double *xRight; // right-hand x coord of each char int len; // length of text and xRight int size; // size of text and xRight arrays - TextString *next; + TextFontInfo *font; // font information + double fontSize; // font size + GBool spaceAfter; // set if there is a space between this + // word and the next word on the line + TextWord *next; // next word in line (before lines are + // assembled: next word in xy order) + + friend class TextLine; friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ + +class TextLine { +public: + + TextLine(); + ~TextLine(); + +private: + + GBool yxBefore(TextLine *line2); + void merge(TextLine *line2); + + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double yBase; // primary baseline y coordinate + double xSpaceL, xSpaceR; // whitespace to left and right of this line + TextFontInfo *font; // primary font + double fontSize; // primary font size + TextWord *words; // words in this line + Unicode *text; // Unicode text of the line, including + // spaces between words + double *xRight; // right-hand x coord of each Unicode char + int *col; // starting column number of each Unicode char + int len; // number of Unicode chars + int convertedLen; // total number of converted characters + GBool hyphenated; // set if last char is a hyphen + TextLine *pageNext; // next line on page + TextLine *next; // next line in block + TextLine *flowNext; // next line in flow + friend class TextBlock; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +class TextBlock { +public: + + TextBlock(); + ~TextBlock(); + +private: + + GBool 
yxBefore(TextBlock *blk2); + void mergeRight(TextBlock *blk2); + void mergeBelow(TextBlock *blk2); + + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double xSpaceL, xSpaceR; // whitespace to left and right of this block + double ySpaceT, ySpaceB; // whitespace above and below this block + double maxFontSize; // max primary font size + TextLine *lines; // lines in block + TextBlock *next; // next block in flow + TextBlock *stackNext; // next block on traversal stack + + friend class TextFlow; + friend class TextPage; }; //------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +class TextFlow { +public: + + TextFlow(); + ~TextFlow(); + +private: + + double yMin, yMax; // bounding box y coordinates + double ySpaceT, ySpaceB; // whitespace above and below this flow + TextBlock *blocks; // blocks in flow + TextLine *lines; // lines in flow + TextFlow *next; // next flow on page + + friend class TextPage; +}; + + +//------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ @@ -77,7 +197,7 @@ class TextPage { public: // Constructor. - TextPage(GBool rawOrderA); + TextPage(GBool rawOrder); // Destructor. ~TextPage(); @@ -86,18 +206,19 @@ public: void updateFont(GfxState *state); - // Begin a new string. - void beginString(GfxState *state, double x0, double y0); + // Begin a new word. + void beginWord(GfxState *state, double x0, double y0); - // Add a character to the current string. + // Add a character to the current word. void addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode *u, int uLen); + double dx, double dy, + CharCode c, Unicode *u, int uLen); - // End the current string, sorting it into the list of strings. 
- void endString(); + // End the current word, sorting it into the list of words. + void endWord(); - // Add a string, sorting it into the list of strings. - void addString(TextString *str); + // Add a word, sorting it into the list of words. + void addWord(TextWord *word); // Coalesce strings that look like parts of the same line. @@ -118,31 +239,41 @@ public: double xMax, double yMax); // Dump contents of page to a file. - void dump(void *outputStream, TextOutputFunc outputFunc); + void dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout); + + // Start a new page. + void startPage(GfxState *state); - // Clear the page. - void clear(); private: - GBool xyBefore(TextString *str1, TextString *str2); - GBool xyBefore(TextBlock *blk1, TextBlock *blk2); - GBool yxBefore(TextBlock *blk1, TextBlock *blk2); - double coalesceFit(TextString *str1, TextString *str2); + void clear(); + double lineFit(TextLine *line, TextWord *lastWord, TextWord *word); + GBool lineFit2(TextLine *line0, TextLine *line1); + GBool blockFit(TextBlock *blk, TextLine *line); + GBool blockFit2(TextBlock *blk0, TextBlock *blk1); + GBool flowFit(TextFlow *flow, TextBlock *blk); - GBool rawOrder; // keep strings in content stream order + GBool rawOrder; // keep text in content stream order - TextString *curStr; // currently active string + double pageWidth, pageHeight; // width and height of current page + TextWord *curWord; // currently active string + TextFontInfo *font; // current font double fontSize; // current font size + int nest; // current nesting level (for Type 3 fonts) + int nTinyChars; // number of "tiny" chars seen so far - TextString *xyStrings; // strings in x-major order (before - // they're sorted into lines) - TextString *xyCur1, *xyCur2; // cursors for xyStrings list - TextLine *lines; // list of lines + TextWord *words; // words, in xy order (before they're + // sorted into lines) + TextWord *wordPtr; // cursor for the word list - int nest; // current nesting 
level (for Type 3 fonts) + TextLine *lines; // lines, in xy order + TextFlow *flows; // flows, in reading order + + GList *fonts; // all font info objects used on this + // page [TextFontInfo] - int nTinyChars; // number of "tiny" chars seen so far }; @@ -155,12 +286,18 @@ public: // Open a text output file. If <fileName> is NULL, no file is // written (this is useful, e.g., for searching text). If - // <rawOrder> is true, the text is kept in content stream order. - TextOutputDev(char *fileName, GBool rawOrderA, GBool append); + // <physLayoutA> is true, the original physical layout of the text + // is maintained. If <rawOrder> is true, the text is kept in + // content stream order. + TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append); // Create a TextOutputDev which will write to a generic stream. If - // <rawOrder> is true, the text is kept in content stream order. - TextOutputDev(TextOutputFunc func, void *stream, GBool rawOrderA); + // <physLayoutA> is true, the original physical layout of the text + // is maintained. If <rawOrder> is true, the text is kept in + // content stream order. + TextOutputDev(TextOutputFunc func, void *stream, + GBool physLayoutA, GBool rawOrderA); // Destructor. virtual ~TextOutputDev(); @@ -221,6 +358,7 @@ public: GString *getText(double xMin, double yMin, double xMax, double yMax); + private: TextOutputFunc outputFunc; // output function @@ -228,6 +366,8 @@ private: GBool needClose; // need to close the output file? // (only if outputStream is a FILE*) TextPage *text; // text for the current page + GBool physLayout; // maintain original physical layout when + // dumping text GBool rawOrder; // keep text in content stream order GBool ok; // set up ok? 
diff --git a/pdf/xpdf/XOutputDev.cc b/pdf/xpdf/XOutputDev.cc index 3c58f56..f9064fa 100644 --- a/pdf/xpdf/XOutputDev.cc +++ b/pdf/xpdf/XOutputDev.cc @@ -828,6 +828,7 @@ XOutputFont *XOutputFontCache::getFont(XRef *xref, GfxFont *gfxFont, } #endif break; + case fontCIDType0: case fontCIDType0C: #if FREETYPE2 && (HAVE_FREETYPE_FREETYPE_H || HAVE_FREETYPE_H) if (freetypeControl != fontRastNone) { @@ -993,7 +994,7 @@ XOutputFont *XOutputFontCache::tryGetFont(XRef *xref, DisplayFontParam *dfp, case displayFontT1: #if HAVE_T1LIB_H - if (t1libControl != fontRastNone) { + if (t1libControl != fontRastNone && !gfxFont->isCIDFont()) { font = tryGetT1FontFromFile(xref, dfp->t1.fileName, gFalse, gfxFont, m11Orig, m12Orig, m21Orig, m22Orig, m11, m12, m21, m22, subst); @@ -1304,7 +1305,7 @@ XOutputFont *XOutputFontCache::tryGetFTFontFromFile(XRef *xref, fontFile = new FTFontFile(ftEngine, fileName->getCString(), ((GfxCIDFont *)gfxFont)->getCIDToGID(), ((GfxCIDFont *)gfxFont)->getCIDToGIDLen()); - } else { // fontCIDType0C + } else { // fontCIDType0, fontCIDType0C fontFile = new FTFontFile(ftEngine, fileName->getCString()); } } else { @@ -1843,7 +1844,7 @@ void XOutputDev::startPage(int pageNum, GfxState *state) { XFillRectangle(display, pixmap, paperGC, 0, 0, pixmapW, pixmapH); // clear text object - text->clear(); + text->startPage(state); } void XOutputDev::endPage() { @@ -1942,6 +1943,9 @@ void XOutputDev::restoreState(GfxState *state) { s = save; save = save->next; delete s; + + // restore the font + updateFont(state); } } @@ -2483,11 +2487,11 @@ void XOutputDev::addPoint(XPoint **points, int *size, int *k, int x, int y) { } void XOutputDev::beginString(GfxState *state, GString *s) { - text->beginString(state, state->getCurX(), state->getCurY()); + text->beginWord(state, state->getCurX(), state->getCurY()); } void XOutputDev::endString(GfxState *state) { - text->endString(); + text->endWord(); } void XOutputDev::drawChar(GfxState *state, double x, double y, @@ -2501,7 +2505,7 
@@ void XOutputDev::drawChar(GfxState *state, double x, double y, double *ctm; double saveCTM[6]; - text->addChar(state, x, y, dx, dy, u, uLen); + text->addChar(state, x, y, dx, dy, code, u, uLen); if (!font) { return; @@ -2676,7 +2680,7 @@ GBool XOutputDev::beginType3Char(GfxState *state, } text->addChar(state, 0, 0, t3Font->cacheTags[i+j].wx, t3Font->cacheTags[i+j].wy, - u, uLen); + code, u, uLen); drawType3Glyph(t3Font, &t3Font->cacheTags[i+j], t3Font->cacheData + (i+j) * t3Font->glyphSize, xt, yt, &color); @@ -2755,7 +2759,7 @@ void XOutputDev::endType3Char(GfxState *state) { t3GlyphStack->origCTM4, t3GlyphStack->origCTM5); } text->addChar(state, 0, 0, t3GlyphStack->wx, t3GlyphStack->wy, - t3GlyphStack->u, t3GlyphStack->uLen); + t3GlyphStack->code, t3GlyphStack->u, t3GlyphStack->uLen); t3gs = t3GlyphStack; t3GlyphStack = t3gs->next; delete t3gs; @@ -2850,11 +2854,61 @@ void XOutputDev::type3D1(GfxState *state, double wx, double wy, XRectangle rect; double *ctm; T3FontCache *t3Font; + double xt, yt, xMin, xMax, yMin, yMax, x1, y1; int i, j; + t3Font = t3GlyphStack->cache; + t3GlyphStack->wx = wx; + t3GlyphStack->wy = wy; + + // check for a valid bbox + state->transform(0, 0, &xt, &yt); + state->transform(llx, lly, &x1, &y1); + xMin = xMax = x1; + yMin = yMax = y1; + state->transform(llx, ury, &x1, &y1); + if (x1 < xMin) { + xMin = x1; + } else if (x1 > xMax) { + xMax = x1; + } + if (y1 < yMin) { + yMin = y1; + } else if (y1 > yMax) { + yMax = y1; + } + state->transform(urx, lly, &x1, &y1); + if (x1 < xMin) { + xMin = x1; + } else if (x1 > xMax) { + xMax = x1; + } + if (y1 < yMin) { + yMin = y1; + } else if (y1 > yMax) { + yMax = y1; + } + state->transform(urx, ury, &x1, &y1); + if (x1 < xMin) { + xMin = x1; + } else if (x1 > xMax) { + xMax = x1; + } + if (y1 < yMin) { + yMin = y1; + } else if (y1 > yMax) { + yMax = y1; + } + if (xMin - xt < t3Font->glyphX || + yMin - yt < t3Font->glyphY || + xMax - xt > t3Font->glyphX + t3Font->glyphW || + yMax - yt > 
t3Font->glyphY + t3Font->glyphH) { + error(-1, "Bad bounding box in Type 3 glyph"); + return; + } + // allocate a cache entry t3GlyphStack->cacheable = gTrue; - t3Font = t3GlyphStack->cache; i = t3GlyphStack->cacheIdx; for (j = 0; j < t3Font->cacheAssoc; ++j) { if ((t3Font->cacheTags[i+j].mru & 0x7fff) == t3Font->cacheAssoc - 1) { @@ -2866,8 +2920,6 @@ void XOutputDev::type3D1(GfxState *state, double wx, double wy, ++t3Font->cacheTags[i+j].mru; } } - t3GlyphStack->wx = wx; - t3GlyphStack->wy = wy; t3GlyphStack->cacheTag->wx = wx; t3GlyphStack->cacheTag->wy = wy; diff --git a/pdf/xpdf/pdftotext.cc b/pdf/xpdf/pdftotext.cc index 150954f..8b13ff2 100644 --- a/pdf/xpdf/pdftotext.cc +++ b/pdf/xpdf/pdftotext.cc @@ -35,6 +35,7 @@ static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt); static int firstPage = 1; static int lastPage = 0; +static GBool physLayout = gFalse; static GBool rawOrder = gFalse; static GBool htmlMeta = gFalse; static char textEncName[128] = ""; @@ -51,6 +52,8 @@ static ArgDesc argDesc[] = { "first page to convert"}, {"-l", argInt, &lastPage, 0, "last page to convert"}, + {"-layout", argFlag, &physLayout, 0, + "maintain original physical layout"}, {"-raw", argFlag, &rawOrder, 0, "keep strings in content stream order"}, {"-htmlmeta", argFlag, &htmlMeta, 0, @@ -222,7 +225,8 @@ int main(int argc, char *argv[]) { } // write text file - textOut = new TextOutputDev(textFileName->getCString(), rawOrder, htmlMeta); + textOut = new TextOutputDev(textFileName->getCString(), + physLayout, rawOrder, htmlMeta); if (textOut->isOk()) { doc->displayPages(textOut, firstPage, lastPage, 72, 0, gFalse); } else { diff --git a/pdf/xpdf/xpdf.cc b/pdf/xpdf/xpdf.cc index ef47fb6..290cfe9 100644 --- a/pdf/xpdf/xpdf.cc +++ b/pdf/xpdf/xpdf.cc @@ -34,6 +34,7 @@ static char ownerPassword[33] = ""; static char userPassword[33] = ""; static GBool fullScreen = gFalse; static char remoteName[100] = "xpdf_"; +static GBool doRemoteReload = gFalse; static GBool 
doRemoteRaise = gFalse; static GBool doRemoteQuit = gFalse; static GBool printCommands = gFalse; @@ -89,6 +90,8 @@ static ArgDesc argDesc[] = { "run in full-screen (presentation) mode"}, {"-remote", argString, remoteName + 5, sizeof(remoteName) - 5, "start/contact xpdf remote server with specified name"}, + {"-reload", argFlag, &doRemoteReload, 0, + "reload xpdf remove server window (with -remote only)"}, {"-raise", argFlag, &doRemoteRaise, 0, "raise xpdf remote server window (with -remote only)"}, {"-quit", argFlag, &doRemoteQuit, 0, @@ -184,12 +187,15 @@ int main(int argc, char *argv[]) { } // check command line + ok = ok && argc >= 1 && argc <= 3; + if (doRemoteReload) { + ok = ok && remoteName[5] && !doRemoteQuit && argc == 1; + } if (doRemoteRaise) { - ok = ok && remoteName[5] && !doRemoteQuit && argc >= 1 && argc <= 3; - } else if (doRemoteQuit) { + ok = ok && remoteName[5] && !doRemoteQuit; + } + if (doRemoteQuit) { ok = ok && remoteName[5] && argc == 1; - } else { - ok = ok && argc >= 1 && argc <= 3; } if (!ok || printVersion || printHelp) { fprintf(stderr, "xpdf version %s\n", xpdfVersion); @@ -225,6 +231,8 @@ int main(int argc, char *argv[]) { } else { app->remoteOpen(fileName, pg, doRemoteRaise); } + } else if (doRemoteReload) { + app->remoteReload(doRemoteRaise); } else if (doRemoteRaise) { app->remoteRaise(); } else if (doRemoteQuit) { diff --git a/pdf/xpdf/xpdfconfig.h b/pdf/xpdf/xpdfconfig.h index bb6eab9..ef1764a 100644 --- a/pdf/xpdf/xpdfconfig.h +++ b/pdf/xpdf/xpdfconfig.h @@ -14,10 +14,10 @@ //------------------------------------------------------------------------ // xpdf version -#define xpdfVersion "2.00" -#define xpdfVersionNum 2.00 +#define xpdfVersion "2.01" +#define xpdfVersionNum 2.01 #define xpdfMajorVersion 2 -#define xpdfMinorVersion 0 +#define xpdfMinorVersion 1 #define xpdfMajorVersionStr "2" // supported PDF version @@ -28,7 +28,7 @@ #define xpdfCopyright "Copyright 1996-2002 Glyph & Cog, LLC" // Windows resource file stuff -#define 
winxpdfVersion "WinXpdf 2.00" +#define winxpdfVersion "WinXpdf 2.01" #define xpdfCopyrightAmp "Copyright 1996-2002 Glyph && Cog, LLC" //------------------------------------------------------------------------ -- cgit v0.9.1