//======================================================================== // // TextOutputDev.h // // Copyright 1997-2003 Glyph & Cog, LLC // //======================================================================== #ifndef TEXTOUTPUTDEV_H #define TEXTOUTPUTDEV_H #include #ifdef USE_GCC_PRAGMAS #pragma interface #endif #include #include "gtypes.h" #include "GfxFont.h" #include "OutputDev.h" class GString; class GList; class GfxFont; class GfxState; class UnicodeMap; //------------------------------------------------------------------------ typedef void (*TextOutputFunc)(void *stream, char *text, int len); //------------------------------------------------------------------------ // TextFontInfo //------------------------------------------------------------------------ class TextFontInfo { public: TextFontInfo(GfxState *state); ~TextFontInfo(); GBool matches(GfxState *state); private: GfxFont *gfxFont; #if TEXTOUT_WORD_LIST GString *fontName; #endif friend class TextWord; friend class TextPage; }; //------------------------------------------------------------------------ // TextWord //------------------------------------------------------------------------ class TextWord { public: // Constructor. TextWord(GfxState *state, int rotA, double x0, double y0, int charPosA, TextFontInfo *fontA, double fontSize); // Destructor. ~TextWord(); // Add a character to the word. void addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u); // Merge onto the end of . void merge(TextWord *word); // Compares to , returning -1 (<), 0 (=), or +1 (>), // based on a primary-axis comparison, e.g., x ordering if rot=0. int primaryCmp(TextWord *word); // Return the distance along the primary axis between and // . double primaryDelta(TextWord *word); static int cmpYX(const void *p1, const void *p2); #if TEXTOUT_WORD_LIST int getLength() { return len; } Unicode getChar(int idx) { return text[idx]; } GString *getText(); GString *getFontName() { return font->fontName; } void getColor(double *r, double *g, double *b) { *r = colorR; *g = colorG; *b = colorB; } void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; } int getCharPos() { return charPos; } int getCharLen() { return charLen; } #endif private: int rot; // rotation, multiple of 90 degrees // (0, 1, 2, or 3) double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate Unicode *text; // the text double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int len; // length of text and edge arrays int size; // size of text and edge arrays int charPos; // character position (within content stream) int charLen; // number of content stream characters in // this word TextFontInfo *font; // font information double fontSize; // font size GBool spaceAfter; // set if there is a space between this // word and the next word on the line TextWord *next; // next word in line #if TEXTOUT_WORD_LIST double colorR, // word color colorG, colorB; #endif friend class TextPool; friend class TextLine; friend class TextBlock; friend class TextFlow; friend class TextWordList; friend class TextPage; }; //------------------------------------------------------------------------ // TextPool //------------------------------------------------------------------------ class TextPool { public: TextPool(); ~TextPool(); TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; } void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; } int getBaseIdx(double base); void addWord(TextWord *word); private: int minBaseIdx; // min baseline bucket index int maxBaseIdx; // max baseline bucket index TextWord **pool; // array of linked lists, one for each // baseline value (multiple of 4 pts) TextWord *cursor; // pointer to last-accessed word int cursorBaseIdx; // baseline bucket index of last-accessed word friend class TextBlock; friend class TextPage; }; //------------------------------------------------------------------------ // TextLine //------------------------------------------------------------------------ class TextLine { public: TextLine(TextBlock *blkA, int rotA, double baseA); ~TextLine(); void addWord(TextWord *word); // Return the distance along the primary axis between and // . double primaryDelta(TextLine *line); // Compares to , returning -1 (<), 0 (=), or +1 (>), // based on a primary-axis comparison, e.g., x ordering if rot=0. int primaryCmp(TextLine *line); // Compares to , returning -1 (<), 0 (=), or +1 (>), // based on a secondary-axis comparison of the baselines, e.g., y // ordering if rot=0. int secondaryCmp(TextLine *line); int cmpYX(TextLine *line); static int cmpXY(const void *p1, const void *p2); void coalesce(UnicodeMap *uMap); private: TextBlock *blk; // parent block int rot; // text rotation double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double base; // baseline x or y coordinate TextWord *words; // words in this line TextWord *lastWord; // last word in this line Unicode *text; // Unicode text of the line, including // spaces between words double *edge; // "near" edge x or y coord of each char // (plus one extra entry for the last char) int *col; // starting column number of each Unicode char int len; // number of Unicode chars int convertedLen; // total number of converted characters GBool hyphenated; // set if last char is a hyphen TextLine *next; // next line in block friend class TextLineFrag; friend class TextBlock; friend class TextFlow; friend class TextWordList; friend class TextPage; }; //------------------------------------------------------------------------ // TextBlock //------------------------------------------------------------------------ class TextBlock { public: TextBlock(TextPage *pageA, int rotA); ~TextBlock(); void addWord(TextWord *word); void coalesce(UnicodeMap *uMap); // Update this block's priMin and priMax values, looking at . void updatePriMinMax(TextBlock *blk); static int cmpXYPrimaryRot(const void *p1, const void *p2); static int cmpYXPrimaryRot(const void *p1, const void *p2); int primaryCmp(TextBlock *blk); double secondaryDelta(TextBlock *blk); // Returns true if is below , relative to the page's // primary rotation. GBool isBelow(TextBlock *blk); private: TextPage *page; // the parent page int rot; // text rotation double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double priMin, priMax; // whitespace bounding box along primary axis TextPool *pool; // pool of words (used only until lines // are built) TextLine *lines; // linked list of lines TextLine *curLine; // most recently added line int nLines; // number of lines int charCount; // number of characters in the block int col; // starting column int nColumns; // number of columns in the block TextBlock *next; TextBlock *stackNext; friend class TextLine; friend class TextLineFrag; friend class TextFlow; friend class TextWordList; friend class TextPage; }; //------------------------------------------------------------------------ // TextFlow //------------------------------------------------------------------------ class TextFlow { public: TextFlow(TextPage *pageA, TextBlock *blk); ~TextFlow(); // Add a block to the end of this flow. void addBlock(TextBlock *blk); // Returns true if fits below in the flow, i.e., (1) // it uses a font no larger than the last block added to the flow, // and (2) it fits within the flow's [priMin, priMax] along the // primary axis. GBool blockFits(TextBlock *blk, TextBlock *prevBlk); private: TextPage *page; // the parent page double xMin, xMax; // bounding box x coordinates double yMin, yMax; // bounding box y coordinates double priMin, priMax; // whitespace bounding box along primary axis TextBlock *blocks; // blocks in flow TextBlock *lastBlk; // last block in this flow TextFlow *next; friend class TextWordList; friend class TextPage; }; #if TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextWordList //------------------------------------------------------------------------ class TextWordList { public: // Build a flat word list, in content stream order (if // text->rawOrder is true), physical layout order (if // is true and text->rawOrder is false), or reading order (if both // flags are false). TextWordList(TextPage *text, GBool physLayout); ~TextWordList(); // Return the number of words on the list. int getLength(); // Return the th word from the list. TextWord *get(int idx); private: GList *words; }; #endif // TEXTOUT_WORD_LIST //------------------------------------------------------------------------ // TextPage //------------------------------------------------------------------------ class TextPage { public: // Constructor. TextPage(GBool rawOrderA); // Destructor. ~TextPage(); // Start a new page. void startPage(GfxState *state); // End the current page. void endPage(); // Update the current font. void updateFont(GfxState *state); // Begin a new word. void beginWord(GfxState *state, double x0, double y0); // Add a character to the current word. void addChar(GfxState *state, double x, double y, double dx, double dy, CharCode c, Unicode *u, int uLen); // End the current word, sorting it into the list of words. void endWord(); // Add a word, sorting it into the list of words. void addWord(TextWord *word); // Coalesce strings that look like parts of the same line. void coalesce(GBool physLayout); // Find a string. If is true, starts looking at the // top of the page; else if is true, starts looking // immediately after the last find result; else starts looking at // ,. If is true, stops looking at the // bottom of the page; else if is true, stops looking // just before the last find result; else stops looking at // ,. GBool findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax); // Get the text which is inside the specified rectangle. GString *getText(double xMin, double yMin, double xMax, double yMax); // Find a string by character position and length. If found, sets // the text bounding rectangle and returns true; otherwise returns // false. GBool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax); // Dump contents of page to a file. void dump(void *outputStream, TextOutputFunc outputFunc, GBool physLayout); #if TEXTOUT_WORD_LIST // Build a flat word list, in content stream order (if // this->rawOrder is true), physical layout order (if // is true and this->rawOrder is false), or reading order (if both // flags are false). TextWordList *makeWordList(GBool physLayout); #endif private: void clear(); void assignColumns(TextLineFrag *frags, int nFrags, int rot); int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GString *s); GBool rawOrder; // keep text in content stream order double pageWidth, pageHeight; // width and height of current page TextWord *curWord; // currently active string int charPos; // next character position (within content // stream) TextFontInfo *curFont; // current font double curFontSize; // current font size int nest; // current nesting level (for Type 3 fonts) int nTinyChars; // number of "tiny" chars seen so far GBool lastCharOverlap; // set if the last added char overlapped the // previous char TextPool *pools[4]; // a "pool" of TextWords for each rotation TextFlow *flows; // linked list of flows TextBlock **blocks; // array of blocks, in yx order int nBlocks; // number of blocks int primaryRot; // primary rotation GBool primaryLR; // primary direction (true means L-to-R, // false means R-to-L) TextWord *rawWords; // list of words, in raw order (only if // rawOrder is set) TextWord *rawLastWord; // last word on rawWords list GList *fonts; // all font info objects used on this // page [TextFontInfo] double lastFindXMin, // coordinates of the last "find" result lastFindYMin; GBool haveLastFind; friend class TextLine; friend class TextLineFrag; friend class TextBlock; friend class TextFlow; friend class TextWordList; }; //------------------------------------------------------------------------ // TextOutputDev //------------------------------------------------------------------------ class TextOutputDev: public OutputDev { public: // Open a text output file. If is NULL, no file is // written (this is useful, e.g., for searching text). If // is true, the original physical layout of the text // is maintained. If is true, the text is kept in // content stream order. TextOutputDev(char *fileName, GBool physLayoutA, GBool rawOrderA, GBool append); // Create a TextOutputDev which will write to a generic stream. If // is true, the original physical layout of the text // is maintained. If is true, the text is kept in // content stream order. TextOutputDev(TextOutputFunc func, void *stream, GBool physLayoutA, GBool rawOrderA); // Destructor. virtual ~TextOutputDev(); // Check if file was successfully created. virtual GBool isOk() { return ok; } //---- get info about output device // Does this device use upside-down coordinates? // (Upside-down means (0,0) is the top left corner of the page.) virtual GBool upsideDown() { return gTrue; } // Does this device use drawChar() or drawString()? virtual GBool useDrawChar() { return gTrue; } // Does this device use beginType3Char/endType3Char? Otherwise, // text in Type 3 fonts will be drawn with drawChar/drawString. virtual GBool interpretType3Chars() { return gFalse; } // Does this device need non-text content? virtual GBool needNonText() { return gFalse; } //----- initialization and control // Start a page. virtual void startPage(int pageNum, GfxState *state); // End a page. virtual void endPage(); //----- update text state virtual void updateFont(GfxState *state); //----- text drawing virtual void beginString(GfxState *state, GString *s); virtual void endString(GfxState *state); virtual void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, Unicode *u, int uLen); //----- special access // Find a string. If is true, starts looking at the // top of the page; else if is true, starts looking // immediately after the last find result; else starts looking at // ,. If is true, stops looking at the // bottom of the page; else if is true, stops looking // just before the last find result; else stops looking at // ,. GBool findText(Unicode *s, int len, GBool startAtTop, GBool stopAtBottom, GBool startAtLast, GBool stopAtLast, double *xMin, double *yMin, double *xMax, double *yMax); // Get the text which is inside the specified rectangle. GString *getText(double xMin, double yMin, double xMax, double yMax); // Find a string by character position and length. If found, sets // the text bounding rectangle and returns true; otherwise returns // false. GBool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax); #if TEXTOUT_WORD_LIST // Build a flat word list, in content stream order (if // this->rawOrder is true), physical layout order (if // this->physLayout is true and this->rawOrder is false), or reading // order (if both flags are false). TextWordList *makeWordList(); #endif private: TextOutputFunc outputFunc; // output function void *outputStream; // output stream GBool needClose; // need to close the output file? // (only if outputStream is a FILE*) TextPage *text; // text for the current page GBool physLayout; // maintain original physical layout when // dumping text GBool rawOrder; // keep text in content stream order GBool ok; // set up ok? }; #endif