#!/usr/bin/env python # -*- coding=utf-8 -*- # ----------------------------------------------------------------------------- # ODS.PY, Version 0.91, Copyright (c) Jarmo Pietiläinen 2012-2015 # http://z0b.kapsi.fi/ods.php # ----------------------------------------------------------------------------- # Tested with Python 2.7.x. Should work with Python 3 if you change the # encoding stuff in function handle_end(); I don't know how to make this # work out-of-the-box in Python 2 and 3 in both Windows and Linux. # ----------------------------------------------------------------------------- # # This software is provided 'as-is', without any express or implied # warranty. In no event will the authors be held liable for any damages # arising from the use of this software. # # Permission is granted to anyone to use this software for any purpose, # including commercial applications, and to alter it and redistribute it # freely, subject to the following restrictions: # # 1. The origin of this software must not be misrepresented; you must not # claim that you wrote the original software. If you use this software # in a product, an acknowledgment in the product documentation would be # appreciated but is not required. # # 2. Altered source versions must be plainly marked as such, and must not be # misrepresented as being the original software. # # 3. This notice may not be removed or altered from any source # distribution. # # ----------------------------------------------------------------------------- # # Some recent changes: # # 0.91 (2015-07-16): # - Restored the shebang line that somehow got removed. # # 0.9 (2015-07-13): # - Added an accumulator variable that collects the cell values from multiple # XML nodes. This is because expat (the XML parser we use) doesn't return # certain strings in one piece, but splits them into multiple parts (for # example, "this & that" is returned as "this ", "&", " that"). There's no # way to turn this "feature" off, so we must deal with it in the hard way. # - The ODS spec allows multiple adjoining cells with identical contents to be # merged into one cell definition. This is now (hopefully) correctly parsed. # # ----------------------------------------------------------------------------- """ Extremely simple ODS (OpenDocument Spreadsheet) data extractor. Loads ODS files, extracts the content.xml file and parses it. Returns a dict of sheets (keys are UTF-8 encoded sheet names). Each sheet contains a dict of rows and each row is a dict of cells. Cell data is stored in UTF-8. Row and column numbers (and dict keys) are one-based; cell A1 is (1, 1), cell X48 is (18, 48) and so on. This is a barebones loader. You are supposed to extend and modify it to suit to your needs! Simple usage example: sheets = load_ods("file.ods") for sheet in sheets.items(): print("Sheet \"%s\"" % (sheet[0])) for row, rdata in sheet[1].items(): print(" Row %d" % (row)) for col, cdata in rdata.items(): print(" Column %d: \"%s\"" % (col, cdata)) """ import zipfile, xml.parsers.expat class ods_loader: """XML parser callback functions. Extracts the cell contents and builds the nested dicts.""" # ----------------------------------------------------- # helper for getting integer values from dicts, with default value # (dict.get() is slow) def get_attribute(self, name, where, d=0): if name in where: return int(where[name]) return d # ----------------------------------------------------- def __init__(self): # are we in a cell? self.tc = False # current sheet name (UTF-8) self.sheet = "" # table row number, table cell number (1-based, not 0-based!) self.row = 1 self.col = 1 # row and column skip/repeat counters self.row_delta = 1 self.col_repeat = 0 self.col_span = 1 # active rowspans self.spans = [] # output is stored here self.sheets = {} # cell content accumulator (expat splits the values into multiple nodes in certain cases) self.value_accumulator = "" # ----------------------------------------------------- def handle_start(self, name, attrs): # start a new sheet if name == "table:table": if not "table:name" in attrs: raise RuntimeError("missing sheet name") self.sheet = attrs["table:name"].encode("utf-8") # start a table row if name == "table:table-row": self.col = 1 self.col_repeat = 0 self.col_span = 1 # what number must be added to the row counter after this row # to get the next row's number? self.row_delta = self.get_attribute("table:number-rows-repeated", attrs, 1) # start a table cell if name == "table:table-cell": # handle active rowspans (skip over columns that span multiple rows) for s in self.spans: if (self.col == s[2]) and (self.row > s[0]) and (self.row < s[1]): self.col += s[3] self.tc = True # Two attributes control column repeat and spanning: # # table:number-columns-repeated: how many times this cell must be repeated # table:number-columns-spanned: how many adjacent cells have been merged into one # # These two are NOT mutually exclusive! They both can be used at the same time. # NOTE: We could also count the "table:covered-table-cell" elements to handle # the column numbers when skipping merged columns. self.col_repeat = self.get_attribute("table:number-columns-repeated", attrs, 1) self.col_span = self.get_attribute("table:number-columns-spanned", attrs, 0) # Row spans are hairier. Basically, we must store them somewhere and use # them to compute the column numbers on other rows, because spanned rows # create columns that "don't exist", but still must be accounted for. row_span = self.get_attribute("table:number-rows-spanned", attrs) # rowspans and colspans go hand-in-hand, so don't store one-row rowspans if row_span > 1: self.spans.append((self.row, self.row + row_span, self.col, self.col_span)) # Sort the spans by their column number, otherwise spanning won't work. # Consider this example: # # A B C # +----+ # 1 | RA | # +----+ + # 2 | RB | | # + + + # 3 | | | # +----+----+ # # Here, rowspan RA is declared before RB, but because RA's column number (2) # won't match before we've skipped span RB (1), the cell numbers in column C # will be off by one. So we sort the spans by the column number; now we skip # RB first, then RA and so the column C will have the correct number (3). self.spans.sort(key = lambda c: c[2]) self.value_accumulator = "" # other useful attributes you might want to handle here: # # "office:value-type" for the cell data type # "office:currency" for the currency type # "table:formula" for raw cell formulas # "office:value", "office:date-value", "office:time-value" (and others) for # the "internal" data, the actual data the program handles; it is separate # from the displayed value (which is what this script extracts for you) # ----------------------------------------------------- def handle_end(self, name): # end a sheet if name == "table:table": self.row = 1 self.col = 1 # end a table row if name == "table:table-row": self.row += self.row_delta # remove rowspans that no longer apply after this row self.spans = [s for s in self.spans if self.row < s[1]] # end a table cell if name == "table:table-cell": self.col_span = max(self.col_span, 1) # if we have a value, store it if self.tc and len(self.value_accumulator) > 0: # create the current sheet if not self.sheet in self.sheets: self.sheets[self.sheet] = {} # create the current row in the current sheet if not self.row in self.sheets[self.sheet]: self.sheets[self.sheet][self.row] = {} # finally create the cell, repeating the contents if needed col = self.col for n in range(0, self.col_repeat): self.sheets[self.sheet][self.row][col] = self.value_accumulator.encode("utf-8") col += self.col_span self.col += self.col_repeat * self.col_span self.col_repeat = 0 self.col_span = 0 self.value_accumulator = "" self.tc = False # ----------------------------------------------------- def handle_data(self, data): # just keep collecting data until the cell ends if self.tc: self.value_accumulator += data # main parser function, call this def load_ods(filename): """Loads an ODS (Open Document Spreadsheet) file and extracts its contents.""" # exract content.xml from the ZIP and parse it content = zipfile.ZipFile(filename, "r").read("content.xml") storage = ods_loader() parser = xml.parsers.expat.ParserCreate() parser.StartElementHandler = storage.handle_start parser.EndElementHandler = storage.handle_end parser.CharacterDataHandler = storage.handle_data parser.Parse(content, True) return storage.sheets # usage example if __name__ == "__main__": import sys if len(sys.argv) != 2: quit("Usage: ods.py ") sheets = load_ods(sys.argv[1]) # dump the cell data for sheet in sheets.items(): print("Sheet \"%s\"" % (sheet[0])) for row_num, row_data in sheet[1].items(): print(" Row %d" % (row_num)) for col_num, col_data in row_data.items(): print(" Column %d: \"%s\"" % (col_num, col_data))