#!/bin/bash
# xsane2djvu - djvu directly from xsane
# Copyright (C) 2011 Heinrich Schwietering
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
##############################################################################
#
# xsane2djvu 0.7
#
# *** djvu made simple (well, kind of ...) ***
#
#
##############################################################################
#
# xsane2djvu is a wrapper script to produce djvu files with xsane.
# It "mimics" the segmentation process used by commercial djvu products.
# The scan is converted into a multilayered djvu file using didjvu
# which depends on the gamera-framework to produce a foreground mask.
# Other encoders like cjb2, c44, cpaldjvu or minidjvu are possible as well.
# Finally a text layer is added using ocrodjvu (OCR engines ocropus or
# cuneiform), and the pages are added up into a bundled djvu file.
#
# Configuration.
# TEMP_DIR is concatenated directly with file names, so it must end in "/".
TEMP_DIR=/tmp/             # folder for temporary files
ERRORLOG="xsane2djvu.log"  # file (inside TEMP_DIR) where STDERR goes

# Called without any argument: print the help text and stop.
# The text is a here-document so that $0 and the log file location are
# expanded while the rest is printed literally.
if [[ -z "$1" ]]; then
  cat <<EOF
Usage: $0 [OPTIONS]
run the script as OCR-application for xsane!
xsane2djvu uses a ppm file, and produces a 'split' djvu file using
didjvu as standard application, the result is ocred using ocrodjvu
OPTIONS:
  -i <file> - temporary XSANE file
  -o <file> - name for the single page output
  -l <lang> - define the language used for recognition
  -e <engine> - ocropus or cuneiform (with libmagick++-support!), noocr
      disables the ocr function
  -a <-ocrodjvu-option> - additional ocrodjvu options, optional
  -f <path> - complete path to save location for
      multipage-djvu, optional, without it a single page djvu is created
  -c <encoder> - usable c44 (no ocr possible), cjb2, cpaldjvu, or minidjvu;
      parameter is optional, without didjvu is used as standard
  -q <-option for encoder> additional options for encoder used, optional
Progress- & error-messages will be stored in this logfile:
$TEMP_DIR$ERRORLOG
xsane2djvu depends on
- XSane http://www.xsane.org/
- DjVuLibre http://djvu.sourceforge.net
- minidjvu http://minidjvu.sourceforge.net/
- cuneiform-linux https://launchpad.net/cuneiform-linux Cuneiform-Linux
- indirectly on libmagick++-dev http://www.imagemagick.org/
- or ocropus http://code.google.com/p/ocropus
- didjvu and ocrodjvu http://jwilk.net/software/
- indirectly on gamera http://gamera.informatik.hsnr.de/
Some coding was stolen from 'ocube'
http://www.geocities.com/thierryguy/ocube.html
This djvu adaption is based on xsane2tess
http://doc.ubuntu-fr.org/xsane2tess,
Hints always welcome! heinrich (dot) schwietering (at) gmx (dot) de

EOF
  exit
fi
#######################################
# Parse the command line into the option variables used by the rest of
# the script.
# Globals written: FILE_PATH (-i), FILE_OUT (-o), REGLANG (-l), ENGINE (-e),
#                  OCROPT (-a), FINAL (-f), ENCODER (-c), ENCOPT (-q)
# Arguments: the command-line arguments of the script
# Outputs:   a warning on stderr for every unknown option or option
#            lacking its argument; such options are otherwise ignored,
#            so the parsed values are unaffected by them
#######################################
parse_options() {
  # Local getopts state, so every call starts parsing at the first argument.
  local OPTION OPTARG OPTIND=1
  # The leading ":" selects getopts' silent mode: problems are reported
  # through the ":" and "?" cases below instead of by getopts itself.
  while getopts ":i:o:l:e:a:f:c:q:" OPTION; do
    case "$OPTION" in
      i) FILE_PATH="$OPTARG" ;;  # input filename (with path)
      o) FILE_OUT="$OPTARG" ;;   # output filename
      l) REGLANG="$OPTARG" ;;    # recognition language
      e) ENGINE="$OPTARG" ;;     # engine option
      a) OCROPT="$OPTARG" ;;     # additional ocrodjvu option
      f) FINAL="$OPTARG" ;;      # final name for bundled djvu file
      c) ENCODER="$OPTARG" ;;    # encoder to use (standard is didjvu)
      q) ENCOPT="$OPTARG" ;;     # option for encoder
      :) echo "$0: option -$OPTARG needs an argument, ignored" 1>&2 ;;
      \?) echo "$0: unknown option -$OPTARG, ignored" 1>&2 ;;
    esac
  done
}

# get options...
parse_options "$@"
# redirect STDERR to ERRORLOG first, so that every later message
# (including the check below) ends up in the log file
exec 2>>"$TEMP_DIR$ERRORLOG"
# -i and -o are mandatory: nothing below can work without an input scan
# and an output name, so stop here instead of failing step by step.
if [[ -z "$FILE_PATH" || -z "$FILE_OUT" ]]; then
  echo "xsane2djvu: both -i (input file) and -o (output file) are required" 1>&2
  exit 1
fi
# redirect STDOUT to FILE_OUT
exec 1>>"$FILE_OUT"
# The encoders read the scan from the path exactly as it was passed in.
# The former expansion "${FILE_PATH##*/.*}" left ordinary paths untouched
# as well, but produced an EMPTY name as soon as the path contained "/."
# (for example a file below a hidden directory), because the pattern
# "*/.*" then matches the whole string.
IN_FILE="$FILE_PATH"
echo "~~~+++~~~~+++~~~" 1>&2
# --- encoding step ----------------------------------------------------------
# Turn the scan "$IN_FILE" into a single-page djvu without text layer,
# written to "$FILE_OUT".  didjvu is used unless -c named another encoder.
# A failing tool is logged and ends the script instead of being reported
# as "used".  The tools' own stdout is sent to the log (1>&2) because the
# script's stdout is opened in append mode on "$FILE_OUT".

# -c and -q may hold several words in one string.  Split them on
# whitespace, with globbing switched off so that a "*" or "?" inside an
# option is passed on literally.
set -f
# shellcheck disable=SC2206  # word splitting is the intent here
enc_cmd=($ENCODER)
# shellcheck disable=SC2206
enc_opts=($ENCOPT)
set +f

# private scratch directory, only created for the minidjvu path
work_dir=""

# Log the failed step ($1), remove what is left of it and stop.
encoding_failed() {
  echo "$1 failed" 1>&2
  if [[ -n "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
  # the script never leaves the text-less djvu behind
  rm -f -- "$FILE_OUT"
  exit 1
}

# check if an encoder is requested
if [[ -n "$ENCODER" ]]; then
  # check if minidjvu is requested
  if [[ "$ENCODER" == "minidjvu" ]]; then
    # the intermediate files live in a scratch directory below TEMP_DIR
    # rather than under fixed names in the current directory
    work_dir=$(mktemp -d "${TEMP_DIR:-/tmp/}xsane2djvu.XXXXXX") \
      || encoding_failed "mktemp"
    # prepare file for minidjvu
    convert "$IN_FILE" "$work_dir/minidjvu.pbm" 1>&2 \
      || encoding_failed "convert"
    echo "file converted" 1>&2
    # use minidjvu and copy the result to the requested name
    minidjvu "${enc_opts[@]}" "$work_dir/minidjvu.pbm" "$work_dir/minidjvu.djvu" 1>&2 \
      || encoding_failed "$ENCODER $ENCOPT"
    echo "$ENCODER $ENCOPT used" 1>&2
    cp "$work_dir/minidjvu.djvu" "$FILE_OUT" \
      || encoding_failed "copying the minidjvu result"
    rm -rf -- "$work_dir"
    work_dir=""
  else
    # use other requested encoder and options
    "${enc_cmd[@]}" "${enc_opts[@]}" "$IN_FILE" "$FILE_OUT" 1>&2 \
      || encoding_failed "$ENCODER $ENCOPT"
    echo "$ENCODER $ENCOPT used" 1>&2
  fi
else
  # produce segmented djvu file with didjvu
  didjvu encode "${enc_opts[@]}" -o "$FILE_OUT" "$IN_FILE" 1>&2 \
    || encoding_failed "didjvu encode $ENCOPT"
  echo "didjvu encode $ENCOPT used" 1>&2
fi
#######################################
# Produce "$FILE_OUT.djvu" from the text-less djvu "$FILE_OUT".
# With ENGINE=noocr the file is only copied; otherwise ocrodjvu adds a
# text layer.  If ocrodjvu fails, the page is copied without a text
# layer so that the scan is not lost.
# Globals read: ENGINE, REGLANG, OCROPT, FILE_OUT
# Outputs:      progress messages and ocrodjvu's stdout go to stderr
#######################################
ocr_page() {
  local lang
  local -a ocr_opts ocr_args

  # check if ocr is requested
  if [[ "$ENGINE" == 'noocr' ]]; then
    cp "$FILE_OUT" "$FILE_OUT.djvu"
    echo "no ocr used" 1>&2
    return
  fi

  # check if language is set, if not use english
  lang="${REGLANG:-eng}"

  # -a may hold several options in one string: split on whitespace,
  # with globbing off so the words are passed on literally.
  set -f
  # shellcheck disable=SC2206  # word splitting is the intent here
  ocr_opts=($OCROPT)
  set +f

  # Only name an engine when -e was given; an empty "--engine=" is not a
  # usable engine id, so the choice is then left to ocrodjvu's default.
  ocr_args=()
  if [[ -n "$ENGINE" ]]; then
    ocr_args+=("--engine=$ENGINE")
  fi
  ocr_args+=(-o "$FILE_OUT.djvu" --language "$lang" "${ocr_opts[@]}" "$FILE_OUT")

  # use requested engine and language for ocr
  if ocrodjvu "${ocr_args[@]}" 1>&2; then
    echo "$ENGINE used with language $lang $OCROPT" 1>&2
  else
    echo "ocr with $ENGINE failed, page kept without text layer" 1>&2
    cp "$FILE_OUT" "$FILE_OUT.djvu"
  fi
}

ocr_page
#######################################
# Final step for one page: optionally add "$FILE_OUT.djvu" to the bundled
# document "$FINAL.djvu", then remove the text-less "$FILE_OUT".
# The single-page djvu is removed only after it has really been stored
# in the bundle; if copying or inserting fails it is kept, so the page
# is not lost.
# Globals read: FINAL, FILE_OUT
# Outputs:      progress messages on stderr
#######################################
finish_page() {
  local bundle

  # check if multipage document is requested
  if [[ -n "$FINAL" ]]; then
    bundle="$FINAL.djvu"
    # check if final file is already existing
    if [[ ! -e "$bundle" ]]; then
      # start final djvu file
      if cp "$FILE_OUT.djvu" "$bundle" 1>&2; then
        echo "$FINAL started" 1>&2
        # remove temporary single page djvu
        rm -- "$FILE_OUT.djvu"
      else
        echo "could not start $FINAL, page kept as $FILE_OUT.djvu" 1>&2
      fi
    else
      # insert new djvu file at the end of final file
      if djvm -insert "$bundle" "$FILE_OUT.djvu" 1>&2; then
        echo "$FILE_OUT added to $FINAL" 1>&2
        # remove temporary single page djvu
        rm -- "$FILE_OUT.djvu"
      else
        echo "could not add $FILE_OUT to $FINAL, page kept as $FILE_OUT.djvu" 1>&2
      fi
    fi
  fi

  # Remove djvu without text
  rm -- "$FILE_OUT"
  echo "~~~+++~~~~+++~~~" 1>&2
}

finish_page