diff options
Diffstat (limited to 'leptonica/prog/cleanpdf.c')
-rw-r--r-- | leptonica/prog/cleanpdf.c | 278 |
1 files changed, 278 insertions, 0 deletions
diff --git a/leptonica/prog/cleanpdf.c b/leptonica/prog/cleanpdf.c new file mode 100644 index 00000000..ae4ee093 --- /dev/null +++ b/leptonica/prog/cleanpdf.c @@ -0,0 +1,278 @@ +/*====================================================================* + - Copyright (C) 2001 Leptonica. All rights reserved. + - + - Redistribution and use in source and binary forms, with or without + - modification, are permitted provided that the following conditions + - are met: + - 1. Redistributions of source code must retain the above copyright + - notice, this list of conditions and the following disclaimer. + - 2. Redistributions in binary form must reproduce the above + - copyright notice, this list of conditions and the following + - disclaimer in the documentation and/or other materials + - provided with the distribution. + - + - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY + - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *====================================================================*/ + +/* + * cleanpdf.c + * + * This program is intended to take as input pdf files that have + * been constructed from poorly compressed images -- typically images + * that have been scanned in grayscale or color but should be rendered + * in black and white (1 bpp). It cleans and compresses them, and + * generates a pdf composed of tiff-g4 compressed images. + * + * It will also take as input clean, orthographically-generated pdfs, + * and concatenate them into a single pdf file of images. + * + * Syntax: cleanpdf basedir threshold resolution outfile [rotation] + * + * The %basedir is a directory where the input pdf files are located. + * The program will operate on every file in this directory with + * the ".pdf" extension. + * + * The input binarization %threshold should be somewhere in the + * range [130 - 190]. The result is typically not very sensitive to + * the value, because internally we use a pixel mapping that is adapted + * to the local background before thresholding to binarize the image. + * + * The output %resolution parameter can take on two values: + * 300 (binarize at the same resolution as the gray or color input, + * which is typically 300 ppi) + * 600 (binarize at twice the resolution of the gray or color input, + * by doing an interpolated 2x expansion on the grayscale + * image, followed by thresholding to 1 bpp) + * At 300 ppi, an 8.5 x 11 page would have 2550 x 3300 pixels. + * You can also input 0 for the default output resolution of 300 ppi. + * + * The pdf output is written to %outfile. It is advisable (but not + * required) to have a '.pdf' extension. + * + * The optional %rotation is an integer: + * 0 no rotation + * 1 90 degrees cw + * 1 180 degrees cw + * 1 270 degrees cw + * + * Whenever possible, the images will be deskewed. + * + * Notes on using filenames with internal spaces. + * * The file-handling functions in leptonica do not support filenames + * that have spaces. To use cleanpdf in linux with such input + * filenames, substitute an ascii character for the spaces; e.g., '^'. + * char *newstr = stringReplaceEachSubstr(str, " ", "^", NULL); + * Then run cleanpdf on the file(s). + * * To get an output filename with spaces, use single quotes; e.g., + * cleanpdf dir thresh res 'filename with spaces' + * + * N.B. This requires the Poppler package of pdf utilities, such as + * pdfimages and pdftoppm. For non-unix systems, this requires + * installation of the cygwin Poppler package: + * https://cygwin.com/cgi-bin2/package-cat.cgi?file=x86/poppler/ + * poppler-0.26.5-1 + */ + +#ifdef HAVE_CONFIG_H +#include <config_auto.h> +#endif /* HAVE_CONFIG_H */ + +#ifdef _WIN32 +# if defined(_MSC_VER) || defined(__MINGW32__) +# include <direct.h> +# else +# include <io.h> +# endif /* _MSC_VER || __MINGW32__ */ +#endif /* _WIN32 */ + + /* Set to 1 to use pdftoppm; 0 for pdfimages */ +#define USE_PDFTOPPM 1 + +#include "string.h" +#include <sys/stat.h> +#include <sys/types.h> +#include "allheaders.h" + + /* Special version */ +PIX *pixConvertTo8Special(PIX *pix); + +l_int32 main(int argc, + char **argv) +{ +char buf[256]; +char *basedir, *fname, *tail, *basename, *imagedir, *outfile, *firstpath; +l_int32 thresh, res, rotation, i, n, ret; +PIX *pixs, *pix1, *pix2, *pix3, *pix4, *pix5; +SARRAY *sa; +static char mainName[] = "cleanpdf"; + + if (argc != 5 && argc != 6) + return ERROR_INT( + "Syntax: cleanpdf basedir threshold resolution outfile [rotation]", + mainName, 1); + basedir = argv[1]; + thresh = atoi(argv[2]); + res = atoi(argv[3]); + outfile = argv[4]; + if (argc == 6) + rotation = atoi(argv[5]); + else + rotation = 0; + if (rotation < 0 || rotation > 3) { + L_ERROR("rotation not in valid set {0,1,2,3}; setting to 0", mainName); + rotation = 0; + } + if (res == 0) + res = 300; + if (res != 300 && res != 600) { + L_ERROR("invalid res = %d; res must be in {0, 300, 600}\n", + mainName, res); + return 1; + } + setLeptDebugOK(1); + +#if 1 + /* Get the names of the pdf files */ + if ((sa = getSortedPathnamesInDirectory(basedir, "pdf", 0, 0)) == NULL) + return ERROR_INT("files not found", mainName, 1); + sarrayWriteStream(stderr, sa); + n = sarrayGetCount(sa); +#endif + + /* Rasterize: use either + * pdftoppm -r 300 fname outroot (-r 300 renders output at 300 ppi) + * or + * pdfimages -j fname outroot (-j outputs jpeg if input is dct) + * Use of pdftoppm: + * This works on all pdf pages, both wrapped images and pages that + * were made orthographically. The default output resolution for + * pdftoppm is 150 ppi, but we use 300 ppi. This makes large + * uncompressed files (e.g., a standard size RGB page image at 300 + * ppi is 25 MB), but it is very fast. This is now preferred over + * using pdfimages. + * Use of pdfimages: + * This only works when all pages are pdf wrappers around images. + * In some cases, it scrambles the order of the output pages + * and inserts extra images. */ + imagedir = stringJoin(basedir, "/image"); +#if 1 +#ifndef _WIN32 + mkdir(imagedir, 0777); +#else + _mkdir(imagedir); +#endif /* _WIN32 */ + for (i = 0; i < n; i++) { + fname = sarrayGetString(sa, i, L_NOCOPY); + splitPathAtDirectory(fname, NULL, &tail); + splitPathAtExtension(tail, &basename, NULL); + #if USE_PDFTOPPM + snprintf(buf, sizeof(buf), "pdftoppm -r 300 %s %s/%s", + fname, imagedir, basename); + #else + snprintf(buf, sizeof(buf), "pdfimages -j %s %s/%s", + fname, imagedir, basename); + #endif /* USE_PDFTOPPM */ + lept_free(tail); + lept_free(basename); + fprintf(stderr, "%s\n", buf); + ret = system(buf); /* pdfimages or pdftoppm */ + } + sarrayDestroy(&sa); +#endif + +#if 1 + /* Clean, deskew and compress */ + sa = getSortedPathnamesInDirectory(imagedir, NULL, 0, 0); + sarrayWriteStream(stderr, sa); + n = sarrayGetCount(sa); + firstpath = NULL; + for (i = 0; i < n; i++) { + fname = sarrayGetString(sa, i, L_NOCOPY); + pixs = pixRead(fname); + pix1 = pixConvertTo8Special(pixs); + if (rotation > 0) + pix2 = pixRotateOrth(pix1, rotation); + else + pix2 = pixClone(pix1); + pix3 = pixFindSkewAndDeskew(pix2, 2, NULL, NULL); + pix4 = pixBackgroundNormSimple(pix3, NULL, NULL); + pixGammaTRC(pix4, pix4, 2.0, 50, 250); + if (res == 300) + pix5 = pixThresholdToBinary(pix4, thresh); + else /* res == 600 */ + pix5 = pixScaleGray2xLIThresh(pix4, thresh); + splitPathAtDirectory(fname, NULL, &tail); + splitPathAtExtension(tail, &basename, NULL); + snprintf(buf, sizeof(buf), "%s/%s.tif", imagedir, basename); + fprintf(stderr, "%s\n", buf); + pixWrite(buf, pix5, IFF_TIFF_G4); + if (i == 0) /* save full path to first image */ + firstpath = stringNew(buf); + pixDestroy(&pixs); + pixDestroy(&pix1); + pixDestroy(&pix2); + pixDestroy(&pix3); + pixDestroy(&pix4); + pixDestroy(&pix5); + lept_free(tail); + lept_free(basename); + } + sarrayDestroy(&sa); +#endif + +#if 1 + /* Generate the pdf. Compute the actual input resolution from + * the pixel dimensions of the first image. This will cause each + * page to be printed to cover an 8.5 x 11 inch sheet of paper. + * We use flate encoding to avoid photometric reversal which + * happens when encoded with G4 tiff. */ + fprintf(stderr, "Write output to %s\n", outfile); + pix1 = pixRead(firstpath); + pixInferResolution(pix1, 11.0, &res); + pixDestroy(&pix1); + lept_free(firstpath); + convertFilesToPdf(imagedir, "tif", res, 1.0, L_G4_ENCODE, + 0, NULL, outfile); +#endif + + return 0; +} + + + /* A special version of pixConvertTo8() that returns an image without + * a colormap and uses pixConvertRGBToGrayMinMax() to strongly + * render color into black. */ +PIX * +pixConvertTo8Special(PIX *pixs) +{ + l_int32 d = pixGetDepth(pixs); + if (d == 1) { + return pixConvert1To8(NULL, pixs, 255, 0); + } else if (d == 2) { + return pixConvert2To8(pixs, 0, 85, 170, 255, FALSE); + } else if (d == 4) { + return pixConvert4To8(pixs, FALSE); + } else if (d == 8) { + if (pixGetColormap(pixs) != NULL) + return pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE); + else + return pixCopy(NULL, pixs); + } else if (d == 16) { + return pixConvert16To8(pixs, L_MS_BYTE); + } else if (d == 32) { + return pixConvertRGBToGrayMinMax(pixs, L_CHOOSE_MIN); + } + + L_ERROR("Invalid depth d = %d\n", "pixConvertSpecialTo8", d); + return NULL; +} |