#!/usr/bin/env python
# -*- coding=utf-8 -*-
# -----------------------------------------------------------------------------
# ODS.PY, Version 0.91, Copyright (c) Jarmo Pietiläinen 2012-2015
# http://z0b.kapsi.fi/ods.php
# -----------------------------------------------------------------------------
# Tested with Python 2.7.x. Should work with Python 3 if you change the
# encoding stuff in function handle_end(); I don't know how to make this
# work out-of-the-box in Python 2 and 3 in both Windows and Linux.
# -----------------------------------------------------------------------------
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
#   1. The origin of this software must not be misrepresented; you must not
#      claim that you wrote the original software. If you use this software
#      in a product, an acknowledgment in the product documentation would be
#      appreciated but is not required.
#
#   2. Altered source versions must be plainly marked as such, and must not be
#      misrepresented as being the original software.
#
#   3. This notice may not be removed or altered from any source
#      distribution.
#
# -----------------------------------------------------------------------------
#
# Some recent changes:
#
#   0.91 (2015-07-16):
#     - Restored the shebang line that somehow got removed.
#
#   0.9 (2015-07-13):
#     - Added an accumulator variable that collects the cell values from multiple
#       XML nodes. This is because expat (the XML parser we use) doesn't return
#       certain strings in one piece, but splits them into multiple parts (for
#       example, "this & that" is returned as "this ", "&", " that"). There's no
#       way to turn this "feature" off, so we must deal with it in the hard way.
#     - The ODS spec allows multiple adjoining cells with identical contents to be
#       merged into one cell definition. This is now (hopefully) correctly parsed.
#
# -----------------------------------------------------------------------------

"""
Extremely simple ODS (OpenDocument Spreadsheet) data extractor. Loads ODS files,
extracts the content.xml file and parses it. Returns a dict of sheets (keys are
UTF-8 encoded sheet names). Each sheet contains a dict of rows and each row is a
dict of cells. Cell data is stored in UTF-8. Row and column numbers (and dict keys)
are one-based; cell A1 is (1, 1), cell X48 is (18, 48) and so on.

This is a barebones loader. You are supposed to extend and modify it to suit to
your needs!

Simple usage example:

sheets = load_ods("file.ods")

for sheet in sheets.items():
    print("Sheet \"%s\"" % (sheet[0]))

    for row, rdata in sheet[1].items():
        print("  Row %d" % (row))

        for col, cdata in rdata.items():
            print("    Column %d: \"%s\"" % (col, cdata))
"""

import zipfile, xml.parsers.expat

class ods_loader:
    """XML parser callback functions. Extracts the cell contents and builds the nested dicts."""

    # -----------------------------------------------------
    # helper for getting integer values from dicts, with default value
    # (dict.get() is slow)
    def get_attribute(self, name, where, d=0):
        if name in where:
            return int(where[name])

        return d

    # -----------------------------------------------------
    def __init__(self):
        # are we in a cell?
        self.tc = False

        # current sheet name (UTF-8)
        self.sheet = ""

        # table row number, table cell number (1-based, not 0-based!)
        self.row = 1
        self.col = 1

        # row and column skip/repeat counters
        self.row_delta = 1
        self.col_repeat = 0
        self.col_span = 1

        # active rowspans
        self.spans = []

        # output is stored here
        self.sheets = {}

        # cell content accumulator (expat splits the values into multiple nodes in certain cases)
        self.value_accumulator = ""


    # -----------------------------------------------------
    def handle_start(self, name, attrs):
        # start a new sheet
        if name == "table:table":
            if not "table:name" in attrs:
                raise RuntimeError("missing sheet name")

            self.sheet = attrs["table:name"].encode("utf-8")

        # start a table row
        if name == "table:table-row":
            self.col = 1
            self.col_repeat = 0
            self.col_span = 1

            # what number must be added to the row counter after this row
            # to get the next row's number?
            self.row_delta = self.get_attribute("table:number-rows-repeated", attrs, 1)

        # start a table cell
        if name == "table:table-cell":
            # handle active rowspans (skip over columns that span multiple rows)
            for s in self.spans:
                if (self.col == s[2]) and (self.row > s[0]) and (self.row < s[1]):
                    self.col += s[3]

            self.tc = True

            # Two attributes control column repeat and spanning:
            #
            #   table:number-columns-repeated: how many times this cell must be repeated
            #   table:number-columns-spanned: how many adjacent cells have been merged into one
            #
            # These two are NOT mutually exclusive! They both can be used at the same time.

            # NOTE: We could also count the "table:covered-table-cell" elements to handle
            # the column numbers when skipping merged columns.

            self.col_repeat = self.get_attribute("table:number-columns-repeated", attrs, 1)
            self.col_span = self.get_attribute("table:number-columns-spanned", attrs, 0)

            # Row spans are hairier. Basically, we must store them somewhere and use
            # them to compute the column numbers on other rows, because spanned rows
            # create columns that "don't exist", but still must be accounted for.
            row_span = self.get_attribute("table:number-rows-spanned", attrs)

            # rowspans and colspans go hand-in-hand, so don't store one-row rowspans
            if row_span > 1:
                self.spans.append((self.row, self.row + row_span, self.col, self.col_span))

                # Sort the spans by their column number, otherwise spanning won't work.
                # Consider this example:
                #
                #       A     B     C
                #           +----+
                #   1       | RA |
                #      +----+    +
                #   2  | RB |    |
                #      +    +    +
                #   3  |    |    |
                #      +----+----+
                #
                # Here, rowspan RA is declared before RB, but because RA's column number (2)
                # won't match before we've skipped span RB (1), the cell numbers in column C
                # will be off by one. So we sort the spans by the column number; now we skip
                # RB first, then RA and so the column C will have the correct number (3).
                self.spans.sort(key = lambda c: c[2])

            self.value_accumulator = ""

            # other useful attributes you might want to handle here:
            #
            #  "office:value-type" for the cell data type
            #  "office:currency" for the currency type
            #  "table:formula" for raw cell formulas
            #  "office:value", "office:date-value", "office:time-value" (and others) for
            #     the "internal" data, the actual data the program handles; it is separate
            #     from the displayed value (which is what this script extracts for you)


    # -----------------------------------------------------
    def handle_end(self, name):
        # end a sheet
        if name == "table:table":
            self.row = 1
            self.col = 1

        # end a table row
        if name == "table:table-row":
            self.row += self.row_delta

            # remove rowspans that no longer apply after this row
            self.spans = [s for s in self.spans if self.row < s[1]]

        # end a table cell
        if name == "table:table-cell":
            self.col_span = max(self.col_span, 1)

            # if we have a value, store it
            if self.tc and len(self.value_accumulator) > 0:
                # create the current sheet
                if not self.sheet in self.sheets:
                    self.sheets[self.sheet] = {}

                # create the current row in the current sheet
                if not self.row in self.sheets[self.sheet]:
                    self.sheets[self.sheet][self.row] = {}

                # finally create the cell, repeating the contents if needed
                col = self.col

                for n in range(0, self.col_repeat):
                    self.sheets[self.sheet][self.row][col] = self.value_accumulator.encode("utf-8")
                    col += self.col_span

            self.col += self.col_repeat * self.col_span
            self.col_repeat = 0
            self.col_span = 0
            self.value_accumulator = ""
            self.tc = False


    # -----------------------------------------------------
    def handle_data(self, data):
        # just keep collecting data until the cell ends
        if self.tc:
            self.value_accumulator += data


# main parser function, call this
def load_ods(filename):
    """Loads an ODS (Open Document Spreadsheet) file and extracts its contents."""
    # exract content.xml from the ZIP and parse it
    content = zipfile.ZipFile(filename, "r").read("content.xml")

    storage = ods_loader()
    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = storage.handle_start
    parser.EndElementHandler = storage.handle_end
    parser.CharacterDataHandler = storage.handle_data
    parser.Parse(content, True)

    return storage.sheets


# usage example
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        quit("Usage: ods.py <ODS file>")

    sheets = load_ods(sys.argv[1])

    # dump the cell data
    for sheet in sheets.items():
        print("Sheet \"%s\"" % (sheet[0]))

        for row_num, row_data in sheet[1].items():
            print("  Row %d" % (row_num))

            for col_num, col_data in row_data.items():
                print("    Column %d: \"%s\"" % (col_num, col_data))