Anhang: xsane2tess4.sh

#!/bin/bash
#    xsane2tess4 - tesseractOCR directly from xsane
#    Copyright (C) 2012-2019 Heinrich Schwietering
# 
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#
##############################################################################
#
#                                   xsane2tess4 0.1
#
#                          *** tesseract made simple ***
#
#
##############################################################################
#
# xsane2tess is a TesseractOCR wrapper to be able to use tesseract with xsane
#
#
#
# Name of the log file; STDERR of the script is appended to it.
ERRORLOG='xsane2tess4.log'
# Directory (written with its trailing slash) that is put in front of
# the log file name.
TEMP_DIR='/tmp/'

#TEST="testoutput.txt"

# Called without any argument: print the help text to STDOUT and stop.
# The option letters listed here must match the getopts string used for
# parsing (i, o, l, c, f).
if [[ -z "$1" ]]; then
  cat <<EOF
Usage: $0 [OPTIONS]

  xsane2tess4 scans images with TesseractOCR
  and outputs text in a file, a PDF or as hocr/html document

  OPTIONS:
    -i <file1>  define input file (any image-format supported)
    -o <file2>  define output-file (*.txt/hOCR/pdf)
    -l <lang>  define language-data tesseract should use
    -c <config> config file name for tesseract (e.g. hocr, pdf)
    -f </path/to/Final> name and path for multiscan document

  Progress- & error-messages will be stored in this logfile:
     $TEMP_DIR$ERRORLOG

  xsane2tess depends on
    - XSane, http://www.xsane.org/
    - TesseractOCR, http://code.google.com/p/tesseract-ocr/
    - pdfunite from poppler-utils
    - econvert from exact-image

  Some coding was stolen from 'ocube'
  http://www.geocities.com/thierryguy/ocube.html

  This adaption is based on xsane2tess
  http://doc.ubuntu-fr.org/xsane2tess,

  Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de


EOF
  exit 0
fi


# ---------------------------------------------------------------------------
# Parse the command line, redirect the output streams, convert the scan to
# PNG and run tesseract on it.
#
# Variables set here and used further down in the script:
#   FILE_PATH  input image (-i)
#   FILE_OUT   output file name (-o); STDOUT of the script is appended to it
#   CONFILE    tesseract config name (-c), e.g. hocr or pdf; empty = text
#   FINAL      base name of the multi-scan document (-f); empty = single scan
#   IN_FILE    input path handed to econvert
# ---------------------------------------------------------------------------

# Send STDERR to the log file first, so that messages about bad options
# end up in the log as well.
exec 2>>"$TEMP_DIR$ERRORLOG"

echo "~~~+++~~~~+++~~~Start $(date +%c)" 1>&2

# The OCR language is kept in OCR_LANG. It must not be stored in LANG:
# LANG is the locale variable, so overwriting it changes the locale seen by
# the child processes, and without -l the locale name would be passed to
# tesseract as language.
OCR_LANG=""

# get options...
while getopts ":i:o:l:c:f:" OPTION; do
  case "$OPTION" in
    i) # input filename (with path)
      FILE_PATH="$OPTARG"
      ;;
    o) # output filename
      FILE_OUT="$OPTARG"
      ;;
    l) # language selection
      OCR_LANG="$OPTARG"
      ;;
    c) # tesseract config (hocr, pdf, ...)
      CONFILE="$OPTARG"
      ;;
    f) # final name for multiscan ocr file
      FINAL="$OPTARG"
      ;;
    :) # option given without its argument
      echo "$0: option -$OPTARG requires an argument" 1>&2
      ;;
    \?) # not in the option string; ignored as before, but no longer silently
      echo "$0: ignoring unknown option -$OPTARG" 1>&2
      ;;
  esac
done

# Without an output name the redirect below fails and the later
# 'rm $FILE_OUT.*' would turn into 'rm .*'; without an input there is
# nothing to recognise.
if [[ -z "$FILE_PATH" || -z "$FILE_OUT" ]]; then
  echo "$0: both -i <input file> and -o <output file> are required" 1>&2
  exit 1
fi

# redirect STDOUT to FILE_OUT
exec 1>>"$FILE_OUT"

# econvert needs a path it can open, so the input path is used unchanged.
# (The former expansion ${FILE_PATH##*/.*} did not strip the directory
# either: it left the path untouched, or emptied it completely whenever the
# path contained "/.".)
IN_FILE="$FILE_PATH"

# convert input to png with 300 dpi; tesseract 4.xx can't recognise the
# resolution of XSane's internal ppm file format, which leads to gigantic
# pdf files
econvert -i "$IN_FILE" --resolution 300 -o IN_FILE.png 1>&2 \
  || echo "econvert failed for $IN_FILE" 1>&2

# start OCR (tesseract expands output with *.txt/.html/.pdf)
tess_args=(IN_FILE.png "$FILE_OUT")
if [[ -n "$OCR_LANG" ]]; then
  tess_args+=(-l "$OCR_LANG")
  used_msg="Tesseract used with -l $OCR_LANG"
else
  # no -l given: let tesseract pick its own default language
  used_msg="Tesseract used with its default language"
fi
if [[ -n "$CONFILE" ]]; then
  # config files go after all options on the tesseract command line
  tess_args+=("$CONFILE")
  used_msg+=" and $CONFILE"
fi

if tesseract "${tess_args[@]}" 1>&2; then
  echo "$used_msg" 1>&2
else
  echo "tesseract failed with exit status $?" 1>&2
fi

#######################################
# Add one freshly recognised page to the multi-scan document.
# If the document does not exist yet, the page becomes the document.
# Otherwise the old document is moved aside, old document and page are
# joined (pdfunite for pdf, cat for everything else) and the copy is
# removed. If joining fails, the old document is put back instead of
# being deleted.
# Arguments:
#   $1 - base name of the page written by tesseract (without extension)
#   $2 - base name of the multi-scan document (without extension)
#   $3 - extension / file type: txt, hocr, pdf, ...
# Outputs:  progress messages on STDERR
# Returns:  0 on success, non-zero if moving or joining failed
#######################################
append_page() {
  local page="$1.$3"
  local doc="$2.$3"
  local backup="$2.new.$3"
  local rc=0

  # first page: start the final document
  if [[ ! -e "$doc" ]]; then
    mv -- "$page" "$doc" || return
    echo "$doc started" 1>&2
    return 0
  fi

  # add new file to existing one
  mv -- "$doc" "$backup" || return
  if [[ "$3" == "pdf" ]]; then
    pdfunite "$backup" "$page" "$doc" || rc=$?
  else
    cat -- "$backup" "$page" > "$doc" || rc=$?
  fi

  if ((rc != 0)); then
    # keep what was collected so far
    mv -f -- "$backup" "$doc"
    echo "adding $page to $doc failed, previous $doc restored" 1>&2
    return "$rc"
  fi

  echo "$page added to $doc" 1>&2
  rm -- "$backup"
}

if [[ -n "$FINAL" ]]; then
  # multi-scan mode: without a config tesseract writes plain text (.txt),
  # otherwise the config name is used as the extension of its output
  append_page "$FILE_OUT" "$FINAL" "${CONFILE:-txt}"
  # the result lives in the final document, drop the per-scan output file
  rm -- "$FILE_OUT"
elif [[ -n "$FILE_OUT" ]]; then
  # STDOUT scanned text => FILE_OUT
  cat -- "$FILE_OUT".*
fi

#######################################
# Remove the intermediate files of this run and write the closing log line.
# Globals:  FILE_OUT (read)
# Outputs:  "Finished" line with the current date on STDERR
#######################################
cleanup() {
  # Guard against an empty FILE_OUT: the pattern would become ".*" and
  # match the dotfiles of the current directory.
  # -f: a pattern without any match is not an error.
  if [[ -n "$FILE_OUT" ]]; then
    rm -f -- "$FILE_OUT".*
  fi
  rm -f -- IN_FILE.png
  echo "~~~+++~~~~+++~~~Finished $(date +%c)" 1>&2
}

cleanup
Anhang herunterladen
Wrapper-Skript für tesseract zu Nutzung aus XSane heraus
Diese Revision wurde am 26. März 2019 15:29 von Heinrich_Schwietering erstellt.