# Anhang: xsane2tess.sh

#!/bin/bash
#
##############################################################################
#
#                                   xsane2tess 1.0
#
#                          *** tesseract made simple ***
#
##############################################################################
#
# xsane2tess is a TesseractOCR wrapper to be able to use tesseract with xsane
#
#
#
# --- settings ---------------------------------------------------------------
# Folder for temporary files (TIFF & tesseract data).  Keep the trailing
# slash: the script builds paths by appending file names directly to it.
TEMP_DIR=~/tmp/
# Name of the file (below TEMP_DIR) where STDERR goes.
ERRORLOG='xsane2tess.log'
# NOTE(review): TEST is not referenced anywhere else in the visible script.
TEST='testoutput.txt'

# Print the help text on STDOUT.  The text names the log file, so it reads
# TEMP_DIR and ERRORLOG at call time.
show_usage() {
   printf '%s\n' "Usage: $0 [OPTIONS]

   xsane2tess converts files to TIF, scans them with TesseractOCR
   and outputs the text in a file.

   OPTIONS:
     -i <file1>  define input file (any image-format supported)
     -o <file2>  define output-file (*.txt)
     -l <lang>  define language-data tesseract should use

   Progress- & error-messages will be stored in this logfile:
      $TEMP_DIR$ERRORLOG

   xsane2tess depends on
     - ImageMagick  http://www.imagemagick.org/
     - TesseractOCR http://code.google.com/p/tesseract-ocr/

   Some coding was stolen from 'ocube'
   http://www.geocities.com/thierryguy/ocube.html
    "
}

# Called without a (non-empty) first argument: show the help and stop.
[[ -n "$1" ]] || { show_usage; exit; }


# get options...
#
# parse_options ARGS...
#   Reads the command-line switches and stores their values in globals:
#     -i <file>  -> FILE_PATH  (input filename, with path)
#     -o <file>  -> FILEOUT    (output filename)
#     -l <lang>  -> TES_LANG   (language selection for tesseract)
#   A switch given without its argument is a usage error: a message is
#   written to STDERR and the script exits with status 2.
#   Unknown switches are reported on STDERR and otherwise ignored, so a
#   caller that passes extra options keeps working as before.
parse_options() {
   # getopts keeps its position in OPTIND; a local copy starting at 1 lets
   # the function be called more than once.
   local OPTION OPTARG OPTIND=1

   # The leading ':' selects silent mode: getopts reports problems through
   # OPTION (':' or '?') and puts the offending switch letter in OPTARG.
   while getopts ":i:o:l:" OPTION; do
      case "$OPTION" in
         i)  # input filename (with path)
            FILE_PATH="$OPTARG"
            ;;
         o)  # output filename
            FILEOUT="$OPTARG"
            ;;
         l)  # Language-selection
            TES_LANG="$OPTARG"
            ;;
         :)  # switch without its mandatory argument
            echo "xsane2tess: option -$OPTARG requires an argument" >&2
            exit 2
            ;;
         \?) # switch this script does not know
            echo "xsane2tess: ignoring unknown option -$OPTARG" >&2
            ;;
      esac
   done
}

parse_options "$@"

# redirect STDOUT to FILE_OUT
# Without -o there is no target (an empty one would only produce a
# redirection error), so STDOUT is then left where it is.  The target is
# quoted so that file names containing spaces work.
if [[ -n "$FILEOUT" ]]; then
   exec 1>>"$FILEOUT" || {
      echo "xsane2tess: cannot write to output file '$FILEOUT'" >&2
      exit 1
   }
fi

# redirect STDERR to ERRORLOG
# The log lives in TEMP_DIR, so make sure that folder exists first.
mkdir -p -- "$TEMP_DIR" || {
   echo "xsane2tess: cannot create folder '$TEMP_DIR'" >&2
   exit 1
}
# The log is a convenience: if it cannot be opened, say so and carry on
# with the STDERR we already have.
exec 2>>"$TEMP_DIR$ERRORLOG" ||
   echo "xsane2tess: cannot open logfile '$TEMP_DIR$ERRORLOG'" >&2

# OCR pipeline: input image -> uncompressed TIFF -> tesseract -> STDOUT.
#
# Reads:   FILE_PATH (input image), TES_LANG (language, optional), TEMP_DIR
# Outputs: the recognised text on STDOUT; everything convert and tesseract
#          print is sent to STDERR
# Exits:   with status 1 as soon as one step fails
#
# The intermediate files are kept in a uniquely named folder below TEMP_DIR,
# so two runs at the same time do not overwrite each other's files.

if [[ -z "$FILE_PATH" ]]; then
   echo "xsane2tess: no input file given (use -i <file>)" >&2
   exit 1
fi

WORK_DIR=$(mktemp -d "${TEMP_DIR}xsane2tess.XXXXXX") || {
   echo "xsane2tess: cannot create a work folder below '$TEMP_DIR'" >&2
   exit 1
}
# delete graphic file and tesseract output after use, on every exit path
trap 'rm -rf -- "$WORK_DIR"' EXIT

TIF_FILE="$WORK_DIR/xsane2tessResult.tif"

# tesseract is given the name without extension and adds ".txt" itself
TXT_FILE="$WORK_DIR/xsane2tessResult"

# converting image into TIFF (ImageMagick)
convert "$FILE_PATH" -compress none "$TIF_FILE" 1>&2 || {
   echo "xsane2tess: convert failed for '$FILE_PATH'" >&2
   exit 1
}

# start OCR (tesseract expands output with *.txt)
# -l is only passed when a language was selected; an empty language name
# would be handed to tesseract as a (non-existent) language otherwise.
TESS_ARGS=("$TIF_FILE" "$TXT_FILE")
if [[ -n "$TES_LANG" ]]; then
   TESS_ARGS+=(-l "$TES_LANG")
fi
tesseract "${TESS_ARGS[@]}" 1>&2 || {
   echo "xsane2tess: tesseract failed for '$TIF_FILE'" >&2
   exit 1
}

# STDOUT scanned text => FILE_OUT
cat -- "$TXT_FILE.txt" || exit 1
# Anhang herunterladen
# Version für Tesseract 2.0x, convert nötig!
# Diese Revision wurde am 19. September 2012 19:13 von Heinrich_Schwietering erstellt.