Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/pdf/xpdf/TextOutputDev.h
diff options
context:
space:
mode:
Diffstat (limited to 'pdf/xpdf/TextOutputDev.h')
-rw-r--r--pdf/xpdf/TextOutputDev.h228
1 files changed, 184 insertions, 44 deletions
diff --git a/pdf/xpdf/TextOutputDev.h b/pdf/xpdf/TextOutputDev.h
index f681ecf..8e94f04 100644
--- a/pdf/xpdf/TextOutputDev.h
+++ b/pdf/xpdf/TextOutputDev.h
@@ -20,12 +20,10 @@
#include "GfxFont.h"
#include "OutputDev.h"
-class GfxState;
class GString;
-class TextBlock;
-class TextLine;
-
-#undef TEXTOUT_DO_SYMBOLS
+class GList;
+class GfxFont;
+class GfxState;
//------------------------------------------------------------------------
@@ -33,43 +31,165 @@ typedef void (*TextOutputFunc)(void *stream, char *text, int len);
//------------------------------------------------------------------------
-// TextString
+// TextFontInfo
//------------------------------------------------------------------------
-class TextString {
+class TextFontInfo {
+public:
+
+ TextFontInfo(GfxState *state);
+ ~TextFontInfo();
+
+ GBool matches(GfxState *state);
+
+private:
+
+ GfxFont *gfxFont;
+ double horizScaling;
+
+ double minSpaceWidth; // min width for inter-word space, as a
+ // fraction of the font size
+ double maxSpaceWidth; // max width for inter-word space, as a
+ // fraction of the font size
+
+
+ friend class TextWord;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextWord
+//------------------------------------------------------------------------
+
+class TextWord {
public:
// Constructor.
- TextString(GfxState *state, double x0, double y0,
- double fontSize);
+ TextWord(GfxState *state, double x0, double y0,
+ TextFontInfo *fontA, double fontSize);
// Destructor.
- ~TextString();
+ ~TextWord();
- // Add a character to the string.
+ // Add a character to the word.
void addChar(GfxState *state, double x, double y,
double dx, double dy, Unicode u);
+
private:
+ GBool xyBefore(TextWord *word2);
+ void merge(TextWord *word2);
+
double xMin, xMax; // bounding box x coordinates
double yMin, yMax; // bounding box y coordinates
- union {
- GBool marked; // temporary flag used by coalesce()
- GBool spaceAfter; // insert a space after this string?
- };
+ double yBase; // baseline y coordinate
Unicode *text; // the text
double *xRight; // right-hand x coord of each char
int len; // length of text and xRight
int size; // size of text and xRight arrays
- TextString *next;
+ TextFontInfo *font; // font information
+ double fontSize; // font size
+ GBool spaceAfter; // set if there is a space between this
+ // word and the next word on the line
+ TextWord *next; // next word in line (before lines are
+ // assembled: next word in xy order)
+
+ friend class TextLine;
friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextLine
+//------------------------------------------------------------------------
+
+class TextLine {
+public:
+
+ TextLine();
+ ~TextLine();
+
+private:
+
+ GBool yxBefore(TextLine *line2);
+ void merge(TextLine *line2);
+
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double yBase; // primary baseline y coordinate
+ double xSpaceL, xSpaceR; // whitespace to left and right of this line
+ TextFontInfo *font; // primary font
+ double fontSize; // primary font size
+ TextWord *words; // words in this line
+ Unicode *text; // Unicode text of the line, including
+ // spaces between words
+ double *xRight; // right-hand x coord of each Unicode char
+ int *col; // starting column number of each Unicode char
+ int len; // number of Unicode chars
+ int convertedLen; // total number of converted characters
+ GBool hyphenated; // set if last char is a hyphen
+ TextLine *pageNext; // next line on page
+ TextLine *next; // next line in block
+ TextLine *flowNext; // next line in flow
+
friend class TextBlock;
+ friend class TextPage;
+};
+
+//------------------------------------------------------------------------
+// TextBlock
+//------------------------------------------------------------------------
+
+class TextBlock {
+public:
+
+ TextBlock();
+ ~TextBlock();
+
+private:
+
+ GBool yxBefore(TextBlock *blk2);
+ void mergeRight(TextBlock *blk2);
+ void mergeBelow(TextBlock *blk2);
+
+ double xMin, xMax; // bounding box x coordinates
+ double yMin, yMax; // bounding box y coordinates
+ double xSpaceL, xSpaceR; // whitespace to left and right of this block
+ double ySpaceT, ySpaceB; // whitespace above and below this block
+ double maxFontSize; // max primary font size
+ TextLine *lines; // lines in block
+ TextBlock *next; // next block in flow
+ TextBlock *stackNext; // next block on traversal stack
+
+ friend class TextFlow;
+ friend class TextPage;
};
//------------------------------------------------------------------------
+// TextFlow
+//------------------------------------------------------------------------
+
+class TextFlow {
+public:
+
+ TextFlow();
+ ~TextFlow();
+
+private:
+
+ double yMin, yMax; // bounding box y coordinates
+ double ySpaceT, ySpaceB; // whitespace above and below this flow
+ TextBlock *blocks; // blocks in flow
+ TextLine *lines; // lines in flow
+ TextFlow *next; // next flow on page
+
+ friend class TextPage;
+};
+
+
+//------------------------------------------------------------------------
// TextPage
//------------------------------------------------------------------------
@@ -77,7 +197,7 @@ class TextPage {
public:
// Constructor.
- TextPage(GBool rawOrderA);
+ TextPage(GBool rawOrder);
// Destructor.
~TextPage();
@@ -86,18 +206,19 @@ public:
void updateFont(GfxState *state);
- // Begin a new string.
- void beginString(GfxState *state, double x0, double y0);
+ // Begin a new word.
+ void beginWord(GfxState *state, double x0, double y0);
- // Add a character to the current string.
+ // Add a character to the current word.
void addChar(GfxState *state, double x, double y,
- double dx, double dy, Unicode *u, int uLen);
+ double dx, double dy,
+ CharCode c, Unicode *u, int uLen);
- // End the current string, sorting it into the list of strings.
- void endString();
+ // End the current word, sorting it into the list of words.
+ void endWord();
- // Add a string, sorting it into the list of strings.
- void addString(TextString *str);
+ // Add a word, sorting it into the list of words.
+ void addWord(TextWord *word);
// Coalesce strings that look like parts of the same line.
@@ -118,31 +239,41 @@ public:
double xMax, double yMax);
// Dump contents of page to a file.
- void dump(void *outputStream, TextOutputFunc outputFunc);
+ void dump(void *outputStream, TextOutputFunc outputFunc,
+ GBool physLayout);
+
+ // Start a new page.
+ void startPage(GfxState *state);
- // Clear the page.
- void clear();
private:
- GBool xyBefore(TextString *str1, TextString *str2);
- GBool xyBefore(TextBlock *blk1, TextBlock *blk2);
- GBool yxBefore(TextBlock *blk1, TextBlock *blk2);
- double coalesceFit(TextString *str1, TextString *str2);
+ void clear();
+ double lineFit(TextLine *line, TextWord *lastWord, TextWord *word);
+ GBool lineFit2(TextLine *line0, TextLine *line1);
+ GBool blockFit(TextBlock *blk, TextLine *line);
+ GBool blockFit2(TextBlock *blk0, TextBlock *blk1);
+ GBool flowFit(TextFlow *flow, TextBlock *blk);
- GBool rawOrder; // keep strings in content stream order
+ GBool rawOrder; // keep text in content stream order
- TextString *curStr; // currently active string
+ double pageWidth, pageHeight; // width and height of current page
+ TextWord *curWord; // currently active string
+ TextFontInfo *font; // current font
double fontSize; // current font size
+ int nest; // current nesting level (for Type 3 fonts)
+ int nTinyChars; // number of "tiny" chars seen so far
- TextString *xyStrings; // strings in x-major order (before
- // they're sorted into lines)
- TextString *xyCur1, *xyCur2; // cursors for xyStrings list
- TextLine *lines; // list of lines
+ TextWord *words; // words, in xy order (before they're
+ // sorted into lines)
+ TextWord *wordPtr; // cursor for the word list
- int nest; // current nesting level (for Type 3 fonts)
+ TextLine *lines; // lines, in xy order
+ TextFlow *flows; // flows, in reading order
+
+ GList *fonts; // all font info objects used on this
+ // page [TextFontInfo]
- int nTinyChars; // number of "tiny" chars seen so far
};
@@ -155,12 +286,18 @@ public:
// Open a text output file. If <fileName> is NULL, no file is
// written (this is useful, e.g., for searching text). If
- // <rawOrder> is true, the text is kept in content stream order.
- TextOutputDev(char *fileName, GBool rawOrderA, GBool append);
+ // <physLayoutA> is true, the original physical layout of the text
+ // is maintained. If <rawOrder> is true, the text is kept in
+ // content stream order.
+ TextOutputDev(char *fileName, GBool physLayoutA,
+ GBool rawOrderA, GBool append);
// Create a TextOutputDev which will write to a generic stream. If
- // <rawOrder> is true, the text is kept in content stream order.
- TextOutputDev(TextOutputFunc func, void *stream, GBool rawOrderA);
+ // <physLayoutA> is true, the original physical layout of the text
+ // is maintained. If <rawOrder> is true, the text is kept in
+ // content stream order.
+ TextOutputDev(TextOutputFunc func, void *stream,
+ GBool physLayoutA, GBool rawOrderA);
// Destructor.
virtual ~TextOutputDev();
@@ -221,6 +358,7 @@ public:
GString *getText(double xMin, double yMin,
double xMax, double yMax);
+
private:
TextOutputFunc outputFunc; // output function
@@ -228,6 +366,8 @@ private:
GBool needClose; // need to close the output file?
// (only if outputStream is a FILE*)
TextPage *text; // text for the current page
+ GBool physLayout; // maintain original physical layout when
+ // dumping text
GBool rawOrder; // keep text in content stream order
GBool ok; // set up ok?