#!/usr/bin/python
#
# imgsizer -- correct image sizes in WWW pages
# by Eric S. Raymond <esr@thyrsus.com>
# 
# Fix up IMG tags in given documents to contain correct sizes.
# 
# Works with Python 1.5.2
#
# Copy, use, and redistribute freely, but don't take my name off it and
# clearly mark an altered version.  Fixes and enhancements cheerfully 
# accepted.
#
# Changelog:
#
# Originally created by Eric S. Raymond <esr@thyrsus.com> 30 Jul 1996
#
# Modified by Erik Rossen <rossen@planet.ch> 15 May 1999
#
#    Added the --nomagick switch, to use file(1) and rdjpgcom(1)
#    to determine the image size instead of identify(1) from the
#    ImageMagick suite.
#
# Modified by Michael C. Toren <michael@toren.net> 18 Aug 2000
#
#    Fixed bug where the SRC attribute's value needed to be in quotes,
#    improved command line parsing (but it could still use some work),
#    added -q switch to omit quotes when generating tags, and -l switch
#    to generate lowercase tags.  -mct
#
# Modified by Michael C. Toren <michael@toren.net> 19 Aug 2000
#
#    Improved the command line parsing some more, now looks for additional
#    arguments via an IMGSIZER environmental variable, added the -d switch
#    to set the DocumentRoot, -v switch to display version information,
#    and -h switch to display usage information.  -mct
#
# Modified by Michael C. Toren <michael@toren.net> 23 Feb 2001
#
#    Fixed two bugs reported by Jeroen Valcke <jeroen@valcke.com>, one
#    where the -d switch did not function properly if the img src attribute
#    was quoted, and another where the &error sub was incorrectly reporting
#    the line number an error occurred due to the input record separator
#    being set to ">".
#
# Rewritten in Python by Eric S. Raymond <esr@thyrsus.com> 11 July 2001
#
#    Time to get rid of the dependency on httpget.  The -l option is gone, too;
#    instead, we deduce the right case by looking at the leading tag.  -q
#    is gone; we always emit without quotes.  -m is gone too, instead we
#    try commands in least to most expensive order, and notice when a command
#    is not found so that we don't try it again.
#
# Fixes by ESR, 29 July 2001
#
#    Incorporated fixes by Peter S. Galbraith.
#
# Fixes by ESR, 25 April 2003
#
#    Merged amended versions of Lennart Poettering's fix for Debian bug 139714
#    and Jeroen N. Witmond's fix for Debian bug 168964.  Added regression-test
#    production.
#
# Enhancement by ESR, 14 Nov 2003
#
#    Verify and merge Lucien Saviot's patch to produce XHTML from XHTML input.
#    Also his change to handle spurious line breaks produced by Dave Raggett's
#    tidy(1) utility.
#
# Modified by Andrew Gwozdziewycz <gwozdzie@lucas.cis.temple.edu>, 17 June 2004
#
#    Added support for the Python Imaging Library to determine size in case of
#    failure from file(1), rdjpgcom(1) and identify(1).

import sys, os, getopt, string, re, urllib, commands

# Arrange for both 1.5 and 2.1 compatibility
# Whichever of the two modules can be imported ends up bound to the name
# "cmp" (which shadows the builtin of the same name), so the file-replacement
# code at the bottom of this script can call cmp.cmp() either way.
try:
    import filecmp
    cmp = filecmp
    # Drop the original name so only "cmp" stays in the module namespace.
    del filecmp
except ImportError:
    # No filecmp available: fall back to a module that is itself named cmp.
    import cmp

# Release identifier; interpolated into the splash banner.
version = "2.7, 05 Aug 2004"

# Banner template: takes the version string as its single %s argument.
splash = """imgsizer version %s, Eric S. Raymond <esr@thyrsus.com>
See <http://www.catb.org/~esr/software.html> for updates."""

# Help text shown for the -h, --help and --usage switches.
usage = """Usage: imgsizer [OPTIONS] [HTML File]

Options:

    -V, --version

        Display version information and exit.

    -h, --help

        Display usage information.

    -d <directory>, --document-root <directory>

        Directory where absolute image filenames (i.e, ones which contain
        a leading "/") may be found.

    -n, --no-overwrite

        Don't overwrite existing width and height tags if both are present.

"""

# Optimization latches -- if an attempt to invoke a command returns 127
# "not found", the matching latch is turned off and that command won't be
# tried again.
#   magick      -- identify(1) from ImageMagick
#   rdjpgcom    -- rdjpgcom(1), used for JPEG files
#   pythonimage -- the Python Imaging Library
magick = rdjpgcom = pythonimage = 1

def attrformat(xc, dim):
    """Render one size attribute, with a leading space, as name="value".

    xc is the value (converted with str()); dim is the attribute name in
    lowercase.  The name is emitted as given when the module-level flag
    `lower` is true and uppercased otherwise.
    """
    name = dim
    if not lower:
        name = string.upper(dim)
    return ' %s="%s"' % (name, str(xc))

def sizefix(infp, outfp):
    """Copy infp to outfp, passing each image tag's attributes to transform().

    The input is read one character at a time.  Everything outside an image
    tag is copied through unchanged.  When a tag opening with "<img" or
    "<IMG" is seen, the text up to the closing ">" (or "/>") is collected
    and replaced by whatever transform() returns for it.  The module-level
    flag `lower` is set to record whether the tag name was lowercase.

    If the input ends in the middle of an image tag, the text collected so
    far is written out untouched, so truncated input is never shortened.
    """
    global lower
    while 1:
        ch = infp.read(1)
        if ch == '':
            return
        outfp.write(ch)
        if ch != '<':
            continue
        # within an HTML tag
        lead = infp.read(2)
        outfp.write(lead)
        if lead not in ("im", "IM"):
            continue
        # splitting the read this way copes with single-char tags like <b>
        third = infp.read(1)
        # Echo exactly what was read; at end of input this is the empty
        # string, so nothing is emitted twice.
        outfp.write(third)
        lead = lead + third
        if lead not in ("img", "IMG"):
            continue
        # within an image tag
        lower = (lead == 'img')
        attributes = ""
        while 1:
            ch = infp.read(1)
            if ch == '':
                # Unterminated tag: flush what we buffered instead of
                # silently dropping it.
                outfp.write(attributes)
                return
            if ch == '>':
                break
            if ch == '/':
                # Look one character ahead so an XHTML-style "/>" is kept
                # together as the tag terminator.
                ch2 = infp.read(1)
                ch = ch + ch2
                if ch2 == '>':
                    break
            attributes = attributes + ch
        # ch holds the terminator (">" or "/>") that ended the tag.
        outfp.write(transform(attributes) + ch)

# x_match picks "<width> x <height>" out of file(1) and identify(1) output;
# rdjpg_match picks "<width>w * <height>h" out of "rdjpgcom -verbose" output.
# Both capture width as group 1 and height as group 2.
x_match = re.compile(r" ([0-9]+) *x *([0-9]+)")
rdjpg_match = re.compile(r" ([0-9]+)w *\* *([0-9]+)h")

def _command_missing(status):
    "Return true if a getstatusoutput() status means 'command not found'."
    # commands.getstatusoutput() reports a wait()-style status, so the
    # shell's exit code 127 arrives in the high byte, not as a bare 127.
    return (status >> 8) == 127

def _load_pil():
    "Import and return the PIL Image module; raises ImportError if absent."
    try:
        import Image
    except ImportError:
        # Newer installations only provide the module inside the PIL package.
        from PIL import Image
    return Image

def _measure(filename):
    "Return (width, height) for a local image file, or None if all probes fail."
    global magick, rdjpgcom, pythonimage
    # The name ultimately comes from an HTML attribute, so quote it before
    # handing it to the shell.  mkarg() returns the argument with a leading
    # space already attached.
    quoted = commands.mkarg(filename)
    # Try file(1) first -- cheapest, as it doesn't read the whole image
    (status, output) = commands.getstatusoutput("file" + quoted)
    if status == 0:
        # file(1) works for every common image format other than JPEG
        if string.find(output, "JPEG") == -1:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif rdjpgcom:
            # Use rdjpgcom(1) to handle JPEGs
            (status, output) = commands.getstatusoutput(
                "rdjpgcom -verbose" + quoted)
            sizes = rdjpg_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
            elif _command_missing(status):
                rdjpgcom = 0
    # Next try identify(1), more expensive but bulletproof
    if magick:
        (status, output) = commands.getstatusoutput("identify" + quoted)
        if status == 0:
            sizes = x_match.search(output)
            if sizes:
                return (sizes.group(1), sizes.group(2))
        elif _command_missing(status):
            sys.stderr.write("imgsizer: giving up on ImageMagick\n")
            magick = 0
    # if that fails, try at _LAST_ resort Python Imaging Library
    # open doesn't actually load all the data, so it shouldn't be too expensive
    if pythonimage:
        try:
            Image = _load_pil()
        except ImportError:
            # Only a missing library turns the latch off for good.
            sys.stderr.write("imgsizer: giving up on Python Imaging Library\n")
            pythonimage = 0
        else:
            try:
                return Image.open(filename).size
            except IOError:
                # This particular image is unreadable; later ones may not be.
                pass
    return None

def imgsize(src):
    """Return the image size in pixels for a given image source.

    src is a filename or URL; it is fetched with urllib.urlretrieve().
    Returns a (width, height) pair -- strings when parsed from command
    output, integers when taken from the Python Imaging Library -- or None
    if the image could not be fetched or measured.  A failure to measure a
    fetched image is reported on stderr.

    Probes are tried from cheapest to most expensive: file(1), rdjpgcom(1)
    for JPEGs, identify(1), then the Python Imaging Library.  The
    module-level latches magick, rdjpgcom and pythonimage are cleared when
    the corresponding tool turns out not to be installed.
    """
    try:
        (filename, headers) = urllib.urlretrieve(src)
    except IOError:
        return None
    # Now let's see if we can get a size for the retrieved image.
    try:
        dimensions = _measure(filename)
    finally:
        # Discard any temporary copy urlretrieve() made of a remote image.
        urllib.urlcleanup()
    if dimensions is None:
        # All attempts failed
        sys.stderr.write("imgsizer: couldn't analyze %s\n" % src)
    return dimensions

# Patterns applied to the attribute text of an image tag.  All of them are
# case-insensitive and accept values that are unquoted, double-quoted or
# single-quoted.
#
# source: group 1 is the value of the SRC attribute.  The name must start
# the text or follow whitespace or a closing quote, so LOWSRC= and
# data-src= are not mistaken for SRC=.  Inside double quotes the value may
# contain an apostrophe, and vice versa.
source  = re.compile(
    r'''(?:^|[\s"'])SRC\s*=\s*["']?'''
    r'''((?<=")[^" \t\n]*|(?<=')[^' \t\n]*|[^"' \t\n]*)["']?''',
    re.I)
# awidth/aheight: a whole numeric WIDTH/HEIGHT attribute including leading
# blanks and both quotes, so substituting "" removes it cleanly.  The
# lookbehind keeps names such as data-width from matching.
awidth  = re.compile(r''' *(?<![-\w])WIDTH\s*=\s*["']?[0-9]*["']?''', re.I)
aheight = re.compile(r''' *(?<![-\w])HEIGHT\s*=\s*["']?[0-9]*["']?''', re.I)
# pwidth/pheight: a WIDTH/HEIGHT attribute given as a percentage.
pwidth  = re.compile(r'''(?<![-\w])WIDTH\s*=\s*["']?[0-9]*%["']?''', re.I)
pheight = re.compile(r'''(?<![-\w])HEIGHT\s*=\s*["']?[0-9]*%["']?''', re.I)

def transform(attr):
    """Return the attribute text of an image tag with correct size attributes.

    attr is everything between the tag name and the closing ">" of an image
    tag.  It is returned unchanged when it has no usable SRC, when WIDTH or
    HEIGHT is given as a percentage, when no_overwrite is set and both
    WIDTH and HEIGHT are already present, or when imgsize() cannot measure
    the image.  Otherwise any existing numeric WIDTH/HEIGHT attributes are
    removed and freshly computed ones are appended.
    """
    src = source.search(attr)
    # Must have a source part and no percents in existing width or height
    if not src or pwidth.search(attr) or pheight.search(attr):
        return attr
    if no_overwrite and awidth.search(attr) and aheight.search(attr):
        return attr
    url = src.group(1)
    # An empty SRC (e.g. src="") names no image; leave the tag alone rather
    # than indexing into an empty string below.
    if not url:
        return attr
    # Correct the url for documentation root, if present
    if url[0] == '/' and root:
        url = os.path.join(root, url[1:])
    # OK, get the size tuple if possible
    dimensions = imgsize(url)
    if not dimensions:
        return attr
    # Nuke any old size attr.  This is unconditional: with no_overwrite set
    # we only get here when at most one of the two is present, and keeping
    # it would leave a duplicate next to the attribute appended below.
    attr = awidth.sub("", attr)
    attr = aheight.sub("", attr)
    (xc, yc) = dimensions
    # Plug in the new attr
    return attr + attrformat(xc, "width") + attrformat(yc, "height")

# Attribute names are uppercase by default; sizefix() resets this flag for
# each image tag so generated attributes match the case of the tag itself.
lower = 0
# Not consulted anywhere in this file; kept so the module's globals are
# unchanged.
quotes = 1

# Set the default DocumentRoot to the current working directory.
root = "."

# Not consulted anywhere in this file either; kept for the same reason.
out = "imgsizer-out$$"
dir = "."	# NOTE: if you are doing <yourfile make sure that pwd is correct! 

# Collect options from the environment first, then the command line
options = os.environ.get("IMGSIZER")
if options:
    options = string.split(options)
else:
    options = []
options = options + sys.argv[1:]

# Process options.  --document-root is the spelling the usage text
# documents; --document is still accepted for backward compatibility.
try:
    (options, arguments) = getopt.getopt(
        options, "Vhd:n",
        ('version', 'help', 'usage', 'document=', 'document-root=',
         'no-overwrite'))
except getopt.error:
    # Report a bad switch with the usage text instead of a traceback.
    sys.stderr.write("imgsizer: %s\n" % sys.exc_info()[1])
    sys.stderr.write(usage)
    sys.exit(2)
no_overwrite = 0
for (switch, val) in options:
    if switch in ('-V', '--version'):
        sys.stdout.write(splash % version + "\n")
        raise SystemExit
    elif switch in ('-h', '--help', '--usage'):
        # The banner is a template; fill in the version before printing.
        sys.stdout.write(splash % version + "\n\n" + usage + "\n")
        raise SystemExit
    elif switch in ('-d', '--document', '--document-root'):
        root = val
        if not os.path.isdir(root):
            sys.stdout.write("Document root isn't a directory\n")
            sys.exit(1)
    elif switch in ('-n', '--no-overwrite'):
        no_overwrite = 1

if not arguments:
    # No files named: act as a filter.
    sizefix(sys.stdin, sys.stdout)
else:
    # Rewrite each named file in place via a temporary file beside it.
    for path in arguments:
        try:
            infp = open(path)
        except (IOError, OSError):
            sys.stdout.write("imgsizer: can't open input file %s\n" % path)
            sys.exit(1)
        tmpname = path + ".~imgsizer-%d~" % os.getpid()
        try:
            outfp = open(tmpname, "w")
        except (IOError, OSError):
            infp.close()
            sys.stdout.write("imgsizer: can't open tempfile\n")
            sys.exit(1)
        try:
            sizefix(infp, outfp)
        finally:
            # Close (and thereby flush) both files before the result is
            # compared with, or renamed over, the original.
            infp.close()
            outfp.close()
        if cmp.cmp(path, tmpname):
            # Nothing changed; leave the original file untouched.
            os.remove(tmpname)
        else:
            try:
                os.rename(tmpname, path)
            except OSError:
                sys.stderr.write("imgsizer: couldn't replace %s\n" % path)
                os.remove(tmpname)

# End
