1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
//========================================================================
//
// pdftotext.cc
//
// Copyright 1997 Derek B. Noonburg
//
//========================================================================
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include "parseargs.h"
#include "GString.h"
#include "gmem.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "TextOutputDev.h"
#include "Params.h"
#include "Error.h"
#include "config.h"
// Command-line option state.  These variables are filled in through the
// argDesc table below and read by main().

// First page to convert ("-f"); main() raises values below 1 to 1.
static int firstPage = 1;
// Last page to convert ("-l"); a value below 1 (the default 0) or beyond
// the document's page count makes main() use the document's last page.
static int lastPage = 0;
// "-ascii7": convert to 7-bit ASCII instead of 8-bit ISO Latin-1.
static GBool useASCII7 = gFalse;
#if JAPANESE_SUPPORT
// "-eucjp": convert Japanese text to EUC-JP; main() also turns on
// useASCII7 when this is set.
static GBool useEUCJP = gFalse;
#endif
// "-raw": keep strings in content stream order; passed to TextOutputDev.
static GBool rawOrder = gFalse;
// Not static and not referenced elsewhere in this file.
// NOTE(review): presumably read by other translation units -- confirm.
GBool printCommands = gFalse;
// "-h" / "-help": main() prints the usage message and exits when set.
static GBool printHelp = gFalse;
// Option table passed to parseArgs() and printUsage() from main().
// Each row gives: option name, argument kind, address of the variable that
// receives the value, a fourth field that is 0 for every option here, and
// the help text shown in the usage message.
static ArgDesc argDesc[] = {
  // --- page range ---
  {"-f",      argInt,  &firstPage, 0, "first page to convert"},
  {"-l",      argInt,  &lastPage,  0, "last page to convert"},

  // --- output encoding ---
  {"-ascii7", argFlag, &useASCII7, 0, "convert to 7-bit ASCII (default is 8-bit ISO Latin-1)"},
#if JAPANESE_SUPPORT
  {"-eucjp",  argFlag, &useEUCJP,  0, "convert Japanese text to EUC-JP"},
#endif

  // --- text layout ---
  {"-raw",    argFlag, &rawOrder,  0, "keep strings in content stream order"},

  // --- diagnostics and help ---
  {"-q",      argFlag, &errQuiet,  0, "don't print any messages or errors"},
  {"-h",      argFlag, &printHelp, 0, "print usage information"},
  {"-help",   argFlag, &printHelp, 0, "print usage information"},

  // NOTE(review): the NULL-named entry presumably terminates the table --
  // confirm against parseargs.h.
  {NULL}
};
// Entry point: convert the pages of a PDF file to a text file.
//
// Usage: pdftotext [options] <PDF-file> [<text-file>]
//
// When <text-file> is omitted, the output name is derived from the PDF
// file name: a trailing ".pdf" or ".PDF" is replaced by ".txt"; otherwise
// ".txt" is appended to the full name.
//
// Exits with status 1 (after printing version, copyright and usage) when
// the arguments are invalid or help was requested.  Otherwise returns 0.
// NOTE(review): 0 is also returned when the document fails to open or
// copying is not permitted, so callers cannot detect those failures from
// the exit status.
int main(int argc, char *argv[]) {
  // All locals are declared up front without initializers so that the
  // "goto err1/err2" jumps below never cross an initialization.
  PDFDoc *doc;
  GString *fileName;
  GString *textFileName;
  TextOutputDev *textOut;
  GBool ok;
  char *p;

  // parse args
  ok = parseArgs(argDesc, &argc, argv);
  if (!ok || argc < 2 || argc > 3 || printHelp) {
    fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
    fprintf(stderr, "%s\n", xpdfCopyright);
    printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
    exit(1);
  }
  fileName = new GString(argv[1]);

  // init error file
  errorInit();

  // read config file
  initParams(xpdfConfigFile);

  // open PDF file
  xref = NULL;
  doc = new PDFDoc(new FileStream (fileOpen(fileName)), fileName);
  if (!doc->isOk()) {
    goto err1;
  }

  // check for copy permission
  if (!doc->okToCopy()) {
    error(-1, "Copying of text from this document is not allowed.");
    goto err2;
  }

  // construct text file name
  if (argc == 3) {
    textFileName = new GString(argv[2]);
  } else {
    // Only look at the last four characters when the name actually has at
    // least four; otherwise the pointer would be formed before the start
    // of the string buffer and strcmp() would read out of bounds.
    p = NULL;
    if (fileName->getLength() >= 4) {
      p = fileName->getCString() + fileName->getLength() - 4;
    }
    if (p && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
      textFileName = new GString(fileName->getCString(),
                                 fileName->getLength() - 4);
    } else {
      textFileName = fileName->copy();
    }
    textFileName->append(".txt");
  }

  // get page range: clamp to [1, number of pages]; lastPage < 1 means
  // "through the end of the document"
  if (firstPage < 1) {
    firstPage = 1;
  }
  if (lastPage < 1 || lastPage > doc->getNumPages()) {
    lastPage = doc->getNumPages();
  }

  // write text file
#if JAPANESE_SUPPORT
  // requesting EUC-JP output also turns on the 7-bit ASCII flag
  useASCII7 |= useEUCJP;
#endif
  textOut = new TextOutputDev(textFileName->getCString(), useASCII7, rawOrder);
  if (textOut->isOk()) {
    doc->displayPages(textOut, firstPage, lastPage, 72, 0);
  }
  delete textOut;

  // clean up
  delete textFileName;
 err2:
  delete doc;
 err1:
  freeParams();

  // check for memory leaks
  Object::memCheck(stderr);
  gMemReport(stderr);

  return 0;
}
|