#!/bin/bash
# Attachment: xsane2djvu.sh
#    xsane2djvu - djvu directly from xsane
#    Copyright (C) 2011 Heinrich Schwietering
# 
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
#
#                                   xsane2djvu 0.7
#
#           *** djvu made simple (well, kind of ...) ***
#
#
##############################################################################
# 
# xsane2djvu is a wrapper script to produce djvu files with xsane.
# It "mimics" the segmentation process used by commercial djvu products.
# The scan is converted into a multilayered djvu file using didjvu,
# which depends on the gamera framework to produce a foreground mask.
# Other encoders like cjb2, c44, cpaldjvu or minidjvu are possible as well.
# Finally a text layer is added using ocrodjvu (OCR engines ocropus or
# cuneiform), and the pages are added up into a bundled djvu file.
#

# Directory prefix (note the trailing slash) for temporary files; within this
# script it is the place where the log file is created.
TEMP_DIR="/tmp/"
# Name of the log file below TEMP_DIR that receives everything sent to STDERR.
ERRORLOG='xsane2djvu.log'

# Called without a (non-empty) first argument there is nothing to process:
# print the help text to STDOUT and stop with status 0.
# "$0" expands to the name the script was called with; the log file location
# shown in the text is built from TEMP_DIR and ERRORLOG.
if [[ -z "$1" ]]; then
  cat <<EOF
Usage: $0 [OPTIONS]
  run the script as OCR-application for xsane!
  xsane2djvu uses ppm files, and produces a 'split' djvu file using
  didjvu as standard application, the result is OCRed using ocrodjvu

  OPTIONS:
    -i <filename> - temporary XSANE file
    -o <filename> - name for the single page output
    -l <language> - define the language used for recognition
    -e <ocr engine> - ocropus or cuneiform (with libmagick++-support!), noocr
     disables the ocr function
    -a <-ocrodjvu-option> - additional ocrodjvu options, optional
    -f </path/to/final/destination> - complete path to save location for
       multipage-djvu, optional, without it a single page djvu is created
    -c <encoder> - usable c44 (no ocr possible), cjb2, cpaldjvu, or minidjvu;
       parameter is optional, without it didjvu is used as standard
    -q <-option for encoder> additional options for encoder used, optional

  Progress- & error-messages will be stored in this logfile:
     $TEMP_DIR$ERRORLOG

  xsane2djvu depends on
    - XSane http://www.xsane.org/
    - DjVuLibre http://djvu.sourceforge.net
    - minidjvu http://minidjvu.sourceforge.net/
    - cuneiform-linux https://launchpad.net/cuneiform-linux Cuneiform-Linux
    - indirectly on libmagick++-dev  http://www.imagemagick.org/
    - or ocropus http://code.google.com/p/ocropus
    - didjvu and ocrodjvu http://jwilk.net/software/
    - indirectly on gamera http://gamera.informatik.hsnr.de/
  Some coding was stolen from 'ocube'
  http://www.geocities.com/thierryguy/ocube.html

  This djvu adaptation is based on xsane2tess
  http://doc.ubuntu-fr.org/xsane2tess

  Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de

EOF
  exit 0
fi

#######################################
# Parse the command line options of the script.
# Globals (written, each only when its option is present):
#   FILE_PATH (-i)  input filename (with path)
#   FILE_OUT  (-o)  output filename
#   REGLANG   (-l)  recognition language
#   ENGINE    (-e)  OCR engine option
#   OCROPT    (-a)  additional ocrodjvu option(s)
#   FINAL     (-f)  final name for the bundled djvu file
#   ENCODER   (-c)  encoder to use (standard is didjvu)
#   ENCOPT    (-q)  option(s) for the encoder
# Arguments: the positional parameters to parse (normally "$@").
# Outputs:   a diagnostic on STDERR for malformed options.
# Returns:   0; exits the shell with status 2 if an option lacks its argument.
#######################################
parse_options() {
  # getopts keeps its position in OPTIND; a local copy reset to 1 makes the
  # function independent of earlier getopts runs.
  local OPTION OPTARG OPTIND=1

  # The leading ':' selects silent mode: getopts reports problems through
  # the ':' and '?' arms below instead of printing its own messages.
  while getopts ":i:o:l:e:a:f:c:q:" OPTION; do
    case "$OPTION" in
      i) FILE_PATH="$OPTARG" ;;
      o) FILE_OUT="$OPTARG" ;;
      l) REGLANG="$OPTARG" ;;
      e) ENGINE="$OPTARG" ;;
      a) OCROPT="$OPTARG" ;;
      f) FINAL="$OPTARG" ;;
      c) ENCODER="$OPTARG" ;;
      q) ENCOPT="$OPTARG" ;;
      :)
        # In silent mode OPTARG holds the option letter that lacks a value.
        echo "$0: option -$OPTARG requires an argument" 1>&2
        exit 2
        ;;
      \?)
        # Unknown options were ignored so far; keep going, but say so.
        echo "$0: ignoring unknown option -$OPTARG" 1>&2
        ;;
    esac
  done
  return 0
}

# get options...
parse_options "$@"



# Redirect STDERR to ERRORLOG (appending). This is done first so that the
# check below is recorded in the log as well. The target is quoted because an
# unquoted name containing blanks makes bash reject the redirection.
exec 2>>"$TEMP_DIR$ERRORLOG"

# Both the input and the output name are needed by every later step; stop
# with a clear message instead of failing on empty file names further down.
if [[ -z "$FILE_PATH" || -z "$FILE_OUT" ]]; then
  echo "xsane2djvu: both -i <filename> and -o <filename> are required" 1>&2
  exit 1
fi

# Redirect STDOUT to FILE_OUT (appending).
exec 1>>"$FILE_OUT"

# Input file handed to the encoders: the name exactly as given with -i.
# The former expansion ${FILE_PATH##*/.*} left ordinary paths unchanged
# (nothing matches unless the path contains "/."), but turned any path with a
# "/." in it - e.g. one inside a hidden directory - into an empty string.
IN_FILE="$FILE_PATH"

echo "~~~+++~~~~+++~~~" 1>&2

#######################################
# Encode the scanned image into a single page djvu file.
# Globals (read): ENCODER, ENCOPT, IN_FILE, FILE_OUT
# Arguments: none
# Outputs:   progress and failure messages on STDERR
# Returns:   0 if the djvu file was produced, 1 if a step failed
#######################################
encode_page() {
  local -a encoder_cmd=() encoder_args=()
  local work_dir status

  # Split encoder and options on blanks, as the unquoted expansions did, but
  # without pathname expansion: an option like "*" stays a literal "*".
  read -r -a encoder_cmd <<< "$ENCODER"
  read -r -a encoder_args <<< "$ENCOPT"

  # no encoder requested: produce segmented djvu file with didjvu
  if (( ${#encoder_cmd[@]} == 0 )); then
    if didjvu encode "${encoder_args[@]}" -o "$FILE_OUT" "$IN_FILE"; then
      echo "didjvu encode $ENCOPT used" 1>&2
      return 0
    fi
    echo "didjvu encode $ENCOPT failed" 1>&2
    return 1
  fi

  # minidjvu is fed a pbm file, so the scan is converted first
  if [[ "$ENCODER" == "minidjvu" ]]; then
    # Intermediate files live in a private directory that is removed again,
    # instead of being left behind in the current directory.
    work_dir=$(mktemp -d) || {
      echo "cannot create temporary directory for $ENCODER" 1>&2
      return 1
    }
    status=1
    if convert "$IN_FILE" "$work_dir/minidjvu.pbm"; then
      echo "file converted" 1>&2
      if minidjvu "${encoder_args[@]}" "$work_dir/minidjvu.pbm" \
           "$work_dir/minidjvu.djvu" \
         && cp "$work_dir/minidjvu.djvu" "$FILE_OUT"; then
        echo "$ENCODER $ENCOPT used" 1>&2
        status=0
      fi
    fi
    rm -rf -- "$work_dir"
    if (( status != 0 )); then
      echo "$ENCODER $ENCOPT failed" 1>&2
    fi
    return "$status"
  fi

  # any other requested encoder: <encoder> <options> <input> <output>
  if "${encoder_cmd[@]}" "${encoder_args[@]}" "$IN_FILE" "$FILE_OUT"; then
    echo "$ENCODER $ENCOPT used" 1>&2
    return 0
  fi
  echo "$ENCODER $ENCOPT failed" 1>&2
  return 1
}

# The status is not acted upon here: as before, the remaining steps of the
# script run in any case; a failure is now visible in the log.
encode_page

#######################################
# Produce "$FILE_OUT.djvu" from "$FILE_OUT": either a plain copy (engine
# 'noocr') or the result of running ocrodjvu on it.
# Globals (read): ENGINE, REGLANG, OCROPT, FILE_OUT
# Arguments: none
# Outputs:   progress and failure messages on STDERR; the output of
#            ocrodjvu is sent to STDERR as well
# Returns:   0 if "$FILE_OUT.djvu" was produced, 1 otherwise
#######################################
add_text_layer() {
  local -a ocr_args=()
  # if no language is set, use english
  local language="${REGLANG:-eng}"

  # check if ocr is requested
  if [[ "$ENGINE" == 'noocr' ]]; then
    if cp "$FILE_OUT" "$FILE_OUT.djvu"; then
      echo "no ocr used" 1>&2
      return 0
    fi
    echo "copying $FILE_OUT to $FILE_OUT.djvu failed" 1>&2
    return 1
  fi

  # Split the additional options on blanks without pathname expansion.
  read -r -a ocr_args <<< "$OCROPT"

  # use requested engine and language for ocr
  if ocrodjvu --engine="$ENGINE" -o "$FILE_OUT.djvu" \
       --language "$language" "${ocr_args[@]}" "$FILE_OUT" 1>&2; then
    echo "$ENGINE used with language $language $OCROPT" 1>&2
    return 0
  fi
  echo "ocrodjvu failed (engine '$ENGINE', language $language $OCROPT)" 1>&2
  return 1
}

# The status is not acted upon here: as before, the remaining steps of the
# script run in any case; a failure is now visible in the log.
add_text_layer

#######################################
# Add a single page djvu to the bundled (multipage) djvu file.
# Arguments:
#   $1 - name of the bundled document without the ".djvu" suffix
#   $2 - name of the single page file without the ".djvu" suffix
# Outputs:   progress and failure messages on STDERR
# Returns:   0 on success; 1 if the page could not be stored, in which case
#            the single page file is kept instead of being deleted
#######################################
bundle_page() {
  local bundle="$1.djvu" page="$2.djvu"

  # check if final file is already existing
  if [[ ! -e "$bundle" ]]; then
    # start final djvu file
    if ! cp -- "$page" "$bundle" 1>&2; then
      echo "starting $1 failed, $page is kept" 1>&2
      return 1
    fi
    echo "$1 started" 1>&2
  else
    # insert new djvu file at the end of final file
    if ! djvm -insert "$bundle" "$page" 1>&2; then
      echo "adding $2 to $1 failed, $page is kept" 1>&2
      return 1
    fi
    echo "$2 added to $1" 1>&2
  fi

  # remove temporary single page djvu - only reached once the page is safely
  # part of the bundle
  rm -- "$page"
}

# check if multipage document is requested
if [[ -n "$FINAL" ]]; then
  bundle_page "$FINAL" "$FILE_OUT"
fi

# Delete the intermediate djvu (the one without text layer), then write the
# closing marker line for this run to STDERR.
rm "${FILE_OUT}"
echo '~~~+++~~~~+++~~~' >&2
# Download attachment
# This revision was created on 14 May 2011 at 10:00 by Heinrich_Schwietering.