diff options
Diffstat (limited to 'pdf/xpdf/TextOutputDev.h')
-rw-r--r-- | pdf/xpdf/TextOutputDev.h | 228 |
1 files changed, 184 insertions, 44 deletions
diff --git a/pdf/xpdf/TextOutputDev.h b/pdf/xpdf/TextOutputDev.h index f681ecf..8e94f04 100644 --- a/pdf/xpdf/TextOutputDev.h +++ b/pdf/xpdf/TextOutputDev.h @@ -20,12 +20,10 @@ #include "GfxFont.h" #include "OutputDev.h" -class GfxState; class GString; -class TextBlock; -class TextLine; - -#undef TEXTOUT_DO_SYMBOLS +class GList; +class GfxFont; +class GfxState; //------------------------------------------------------------------------ @@ -33,43 +31,165 @@ typedef void (*TextOutputFunc)(void *stream, char *text, int len); //------------------------------------------------------------------------ -// TextString +// TextFontInfo //------------------------------------------------------------------------ -class TextString { +class TextFontInfo { +public: + + TextFontInfo(GfxState *state); + ~TextFontInfo(); + + GBool matches(GfxState *state); + +private: + + GfxFont *gfxFont; + double horizScaling; + + double minSpaceWidth; // min width for inter-word space, as a + // fraction of the font size + double maxSpaceWidth; // max width for inter-word space, as a + // fraction of the font size + + + friend class TextWord; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextWord +//------------------------------------------------------------------------ + +class TextWord { public: // Constructor. - TextString(GfxState *state, double x0, double y0, - double fontSize); + TextWord(GfxState *state, double x0, double y0, + TextFontInfo *fontA, double fontSize); // Destructor. - ~TextString(); + ~TextWord(); - // Add a character to the string. + // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u); + private: + GBool xyBefore(TextWord *word2); + void merge(TextWord *word2); + double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates - union { - GBool marked; // temporary flag used by coalesce() - GBool spaceAfter; // insert a space after this string? - }; + double yBase; // baseline y coordinate Unicode *text; // the text double *xRight; // right-hand x coord of each char int len; // length of text and xRight int size; // size of text and xRight arrays - TextString *next; + TextFontInfo *font; // font information + double fontSize; // font size + GBool spaceAfter; // set if there is a space between this + // word and the next word on the line + TextWord *next; // next word in line (before lines are + // assembled: next word in xy order) + + friend class TextLine; friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextLine +//------------------------------------------------------------------------ + +class TextLine { +public: + + TextLine(); + ~TextLine(); + +private: + + GBool yxBefore(TextLine *line2); + void merge(TextLine *line2); + + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double yBase; // primary baseline y coordinate + double xSpaceL, xSpaceR; // whitespace to left and right of this line + TextFontInfo *font; // primary font + double fontSize; // primary font size + TextWord *words; // words in this line + Unicode *text; // Unicode text of the line, including + // spaces between words + double *xRight; // right-hand x coord of each Unicode char + int *col; // starting column number of each Unicode char + int len; // number of Unicode chars + int convertedLen; // total number of converted characters + GBool hyphenated; // set if last char is a hyphen + TextLine *pageNext; // next line on page + TextLine *next; // next line in block + TextLine *flowNext; // next line in flow + friend class TextBlock; + friend class TextPage; +}; + +//------------------------------------------------------------------------ +// TextBlock +//------------------------------------------------------------------------ + +class TextBlock { +public: + + TextBlock(); + ~TextBlock(); + +private: + + GBool yxBefore(TextBlock *blk2); + void mergeRight(TextBlock *blk2); + void mergeBelow(TextBlock *blk2); + + double xMin, xMax; // bounding box x coordinates + double yMin, yMax; // bounding box y coordinates + double xSpaceL, xSpaceR; // whitespace to left and right of this block + double ySpaceT, ySpaceB; // whitespace above and below this block + double maxFontSize; // max primary font size + TextLine *lines; // lines in block + TextBlock *next; // next block in flow + TextBlock *stackNext; // next block on traversal stack + + friend class TextFlow; + friend class TextPage; }; //------------------------------------------------------------------------ +// TextFlow +//------------------------------------------------------------------------ + +class TextFlow { +public: + + TextFlow(); + ~TextFlow(); + +private: + + double yMin, yMax; // bounding box y coordinates + double ySpaceT, ySpaceB; // whitespace above and below this flow + TextBlock *blocks; // blocks in flow + TextLine *lines; // lines in flow + TextFlow *next; // next flow on page + + friend class TextPage; +}; + + +//------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ @@ -77,7 +197,7 @@ class TextPage { public: // Constructor. - TextPage(GBool rawOrderA); + TextPage(GBool rawOrder); // Destructor. ~TextPage(); @@ -86,18 +206,19 @@ public: void updateFont(GfxState *state); - // Begin a new string. - void beginString(GfxState *state, double x0, double y0); + // Begin a new word. + void beginWord(GfxState *state, double x0, double y0); - // Add a character to the current string. + // Add a character to the current word. void addChar(GfxState *state, double x, double y, - double dx, double dy, Unicode *u, int uLen); + double dx, double dy, + CharCode c, Unicode *u, int uLen); - // End the current string, sorting it into the list of strings. - void endString(); + // End the current word, sorting it into the list of words. + void endWord(); - // Add a string, sorting it into the list of strings. - void addString(TextString *str); + // Add a word, sorting it into the list of words. + void addWord(TextWord *word); // Coalesce strings that look like parts of the same line. @@ -118,31 +239,41 @@ public: double xMax, double yMax); // Dump contents of page to a file. - void dump(void *outputStream, TextOutputFunc outputFunc); + void dump(void *outputStream, TextOutputFunc outputFunc, + GBool physLayout); + + // Start a new page. + void startPage(GfxState *state); - // Clear the page. - void clear(); private: - GBool xyBefore(TextString *str1, TextString *str2); - GBool xyBefore(TextBlock *blk1, TextBlock *blk2); - GBool yxBefore(TextBlock *blk1, TextBlock *blk2); - double coalesceFit(TextString *str1, TextString *str2); + void clear(); + double lineFit(TextLine *line, TextWord *lastWord, TextWord *word); + GBool lineFit2(TextLine *line0, TextLine *line1); + GBool blockFit(TextBlock *blk, TextLine *line); + GBool blockFit2(TextBlock *blk0, TextBlock *blk1); + GBool flowFit(TextFlow *flow, TextBlock *blk); - GBool rawOrder; // keep strings in content stream order + GBool rawOrder; // keep text in content stream order - TextString *curStr; // currently active string + double pageWidth, pageHeight; // width and height of current page + TextWord *curWord; // currently active string + TextFontInfo *font; // current font double fontSize; // current font size + int nest; // current nesting level (for Type 3 fonts) + int nTinyChars; // number of "tiny" chars seen so far - TextString *xyStrings; // strings in x-major order (before - // they're sorted into lines) - TextString *xyCur1, *xyCur2; // cursors for xyStrings list - TextLine *lines; // list of lines + TextWord *words; // words, in xy order (before they're + // sorted into lines) + TextWord *wordPtr; // cursor for the word list - int nest; // current nesting level (for Type 3 fonts) + TextLine *lines; // lines, in xy order + TextFlow *flows; // flows, in reading order + + GList *fonts; // all font info objects used on this + // page [TextFontInfo] - int nTinyChars; // number of "tiny" chars seen so far }; @@ -155,12 +286,18 @@ public: // Open a text output file. If <fileName> is NULL, no file is // written (this is useful, e.g., for searching text). If - // <rawOrder> is true, the text is kept in content stream order. - TextOutputDev(char *fileName, GBool rawOrderA, GBool append); + // <physLayoutA> is true, the original physical layout of the text + // is maintained. If <rawOrder> is true, the text is kept in + // content stream order. + TextOutputDev(char *fileName, GBool physLayoutA, + GBool rawOrderA, GBool append); // Create a TextOutputDev which will write to a generic stream. If - // <rawOrder> is true, the text is kept in content stream order. - TextOutputDev(TextOutputFunc func, void *stream, GBool rawOrderA); + // <physLayoutA> is true, the original physical layout of the text + // is maintained. If <rawOrder> is true, the text is kept in + // content stream order. + TextOutputDev(TextOutputFunc func, void *stream, + GBool physLayoutA, GBool rawOrderA); // Destructor. virtual ~TextOutputDev(); @@ -221,6 +358,7 @@ public: GString *getText(double xMin, double yMin, double xMax, double yMax); + private: TextOutputFunc outputFunc; // output function @@ -228,6 +366,8 @@ private: GBool needClose; // need to close the output file? // (only if outputStream is a FILE*) TextPage *text; // text for the current page + GBool physLayout; // maintain original physical layout when + // dumping text GBool rawOrder; // keep text in content stream order GBool ok; // set up ok? |