Excel Perl XML

2004-09-15

Excel, at least as of version 2002, will save directly to XML. When you pick save just choose the xml format. From that point on, you can edit that XML file directly in Excel and when you re−save Excel will continue to save it as XML. The point is, you should almost NEVER EVER use xls files EVER because XLS files are not usable by anything but Excel. Since XML files are usable by both Excel and other software you'll only confuse yourself and others if you keep using xls files. In fact, as of Office 2003 Windows will open Excel created XML files in Excel by default. Even better.

Here's some perl to parse an Excel XML file.

#
use strict;
use warnings;

use XML::Simple;
use Data::Dumper;
use Encode;

my inFilename  = ARGV[0];
my outFilename = ARGV[1];
if (!defined(inFilename) || !defined(outFilename))
{
   die ("usage: xlReader.pl infilename outfilename [encoding]\n");
}

my g_encoding = ARGV[2];
if (!defined(g_encoding))
{
   g_encoding = "utf8";
}

my xml = XMLin(inFilename,
   ForceArray => 1
   );

my xl = { };

# build hash of rows indexed by rowNumber
# of hash of cells indexed by columnNumber
my lastRow    = 0;  # remember the bottom most row we find
my lastColumn = 0;  # remember the right most column we find
{
   my currentRow = 0;

   my rows = xml->{'Worksheet'}->[0]->{'Table'}->[0]->{'Row'};

   foreach my row (@{rows})
   {
      # figure out the row's index
      my rowIndex = currentRow + 1;
      if (defined (row->{'ss:Index'}))
      {
         rowIndex = row->{'ss:Index'};
      }

      # track last row
      if (rowIndex > lastRow) { lastRow = rowIndex; }

      # assert if we already have this row
      if (defined (xl->{rowIndex}))
      {
        die ("row rowIndex already defined!\n");
      }

      # make a new empty hash for this row
      xl->{rowIndex} = { };

      my rowHash = xl->{rowIndex};
      my currentCol = 0;
      my cells = row->{'Cell'};
      foreach my cell (@{cells})
      {
         # figure out the column index
         my colIndex = currentCol + 1;
         if (defined (cell->{'ss:Index'}))
         {
            colIndex = cell->{'ss:Index'}
         }

         # assert if we already have this column in this row
         if (defined(rowHash->{colIndex}))
         {
            die "column colIndex in row rowIndex already defined!\n";
         }

         # add the cell here
         if (defined (cell->{'Data'}))
         {
            rowHash->{colIndex} = cell->{'Data'}->[0];

            # track lastCol
            if (colIndex > lastColumn) { lastColumn = colIndex; }
         }

         currentCol = colIndex;

         # if a cell is used across multiple cells
         # copy the contents to all cells it covers
         if (defined(cell->{'ss:MergeAcross'}))
         {
            currentCol += cell->{'ss:MergeAcross'};
            if (defined (cell->{'Data'}))
            {
               for (my ii = 1; ii <= cell->{'ss:MergeAcross'}; ii++)
               {
                  rowHash->{colIndex + ii} = cell->{'Data'}->[0];
               }
            }
         }
      }

      currentRow = rowIndex;
   }
}

#print Dumper (xl);

# at this point we have a hash of rows of columns so we can access any cell
# in the spreadsheet by calling GetCellValue(xl, row, col);
# GetCellValue may return undef if that cell does not exist
# You can also call GetCellValueCheck which will fail if the cell does not
# exist
#
# In the following example we will generate a text file in the following these rules
# 1) Row 1 is used to mark the start ( s ) and end ( e ) of what you want converted
#    You can also designate a label column ( l ). If they are same columns you can
#    specify them with a comma as in "s,l"
# 2) Row 2 is a description
# 3) Row 3 is the field name, if it is only 1 or 2 characters the description (row 1)
#    will be prefixed
# 4) Row 4 is the default row. Any value not appearing below will default to the
#    value in row 4
# 5) Actual data stars in row 5
#
#

my FILE = IO::Handle->new();
my startColumn;
my endColumn;
my labelColumn;

# find start, end and label columns
for (my colIndex = 1; colIndex < 300; colIndex++)
{
   my value = GetCellValue(xl, 1, colIndex);
   if (defined(value))
   {
#      print "value=(value)\n";
      if (value =~ /(?:^|,)e(?:,|)/i)
      {
         endColumn = colIndex;
         last;
      }
      if (value =~ /(?:^|,)s(?:,|)/i)
      {
         startColumn = colIndex;
      }
      if (value =~ /(?:^|,)l(?:,|)/i)
      {
         labelColumn = colIndex;
      }
   }
}

if (!defined(startColumn)) { die ("could not find start column\n"); }
if (!defined(endColumn)) { die ("could not find end column\n"); }

open (FILE, ">" . outFilename) || die ("could not open file outFilename\n");

# build stage data
for (my rowIndex = 5; rowIndex <= lastRow; rowIndex++)
{
   print FILE "#-------------------------------------------------------\n";
   if (defined(labelColumn))
   {
      my label = GetCellValueCheck(xl, rowIndex, labelColumn);
      print FILE "label = ", perl2output(label), "\n";
   }

   for (my colIndex = startColumn; colIndex <= endColumn; colIndex++)
   {
      # for each column:
      # grab description in row 2
      # grab label in row 3
      # grab default from row 4
      # use default if value does not exist
      my description = GetCellValue(xl, 2, colIndex);
      my columnLabel = GetCellValueCheck(xl, 3, colIndex);
      my default     = GetCellValueCheck(xl, 4, colIndex);

      my value = GetCellValue(xl, rowIndex, colIndex);

      if (!defined(value)) { value = default; }

      # --- if the column label is too short add the cell above it
      if (length(columnLabel) < 3)
      {
         columnLabel = description . columnLabel;
      }

      # --- exceptions ---
      if (!defined (description)) { description = "-???-"; }

      print FILE perl2output(columnLabel), " = ", value,
        " # ", perl2output(description), "\n";
   }

   print FILE "\n";
}

close FILE;
exit 0;

sub GetCellValueCheck
{
   my (xl, row, col) = @_;

   my value = GetCellValue(xl, row, col);
   if (!defined (value))
   {
      die "required cell " . ToExcelColumn(col) . "row not defined!\n";
   }

   return value;
}

sub GetCellValue
{
   my (xl, row, col) = @_;

   if (defined(xl->{row}) && defined(xl->{row}->{col}))
   {
      return xl->{row}->{col}->{'content'};
   }

   return undef;
}

sub ToExcelColumn
{
   my (col) = @_;

   if (col <= 26) { return chr(col+64); }

   return chr((col - 1 / 26) + 65) . chr(((col - 1) % 26) + 65);
}

sub FromExcelColumn
{
   my (col) = @_;

   my colNdx = 0;
   for (my cc = 0; cc < length(col); cc++)
   {
      colNdx = colNdx * 26 + ord(uc(substr(col,cc,1))) - 65;
   }

   return colNdx + 1;
}

sub perl2output
{
   my (str) = @_;

   return encode(g_encoding, str);
}

Just as an example, given a spreadsheet that looks like this

l s e
? ? ? Color
Name Hitpoints Icon R G B
default label1 wimpy255255255
ogre10 wimpy20015010
orc15 wimpy20716015
dragon200 strong ?200123
beholder150 strong ?5012
snake5 ? ? ? ?

The perl above will produce this file

#-------------------------------------------------------
label = label
Hitpoints = 1 # -???-
Icon = wimpy # -???-
ColorR = 255 # Color
ColorG = 255 # Color
ColorB = 255 # Color

#-------------------------------------------------------
label = ogre
Hitpoints = 10 # -???-
Icon = wimpy # -???-
ColorR = 200 # Color
ColorG = 150 # Color
ColorB = 10 # Color

#-------------------------------------------------------
label = orc
Hitpoints = 15 # -???-
Icon = wimpy # -???-
ColorR = 207 # Color
ColorG = 160 # Color
ColorB = 15 # Color

#-------------------------------------------------------
label = dragon
Hitpoints = 200 # -???-
Icon = strong # -???-
ColorR = 255 # Color
ColorG = 200 # Color
ColorB = 123 # Color

#-------------------------------------------------------
label = beholder
Hitpoints = 150 # -???-
Icon = strong # -???-
ColorR = 255 # Color
ColorG = 50 # Color
ColorB = 12 # Color

#-------------------------------------------------------
label = snake
Hitpoints = 5 # -???-
Icon = wimpy # -???-
ColorR = 255 # Color
ColorG = 255 # Color
ColorB = 255 # Color
Comments
Effective 3D Exporter Design: How to Make Artists Love You
The Economics of making games