Re: Do you mind if the HTML is a pile of dung?

...for a larger application. Hack as needed/desired. The "SaveAs type 10" is one of the more important bits -- it tells Word to write "filtered" HTML.
import io
import os
import re

try:
    import win32com.client
except ImportError:
    # pywin32 is not installed here. Keep the module importable so the
    # plain-text Converter still works; WordDocument.toTopic reports the
    # missing dependency when it is actually called.
    win32com = None

# Second argument to Document.SaveAs: Word's "filtered" HTML format.
_WD_FORMAT_FILTERED_HTML = 10
# Argument to Document.Close: discard changes instead of prompting to save.
_WD_DO_NOT_SAVE_CHANGES = 0

# (pattern, replacement) pairs applied in order to Word's filtered HTML.
_CLEANUP_RULES = (
    # Normalise any remaining CRLF pairs.
    (r"\r\n", r'\n'),
    # Keep only what sits between <body ...> and </body>.
    (r'(?s)^.*<body[^>]*>(.*)</body>.*$', r'\1'),
    # Drop inline styles.
    (r"style='[^']*'", r''),
    # Drop Word's Mso* class attributes (and whatever follows, up to '>').
    (r"class=Mso[^>]*", r''),
    # Drop <div ...> wrappers, opening and closing.
    (r"<div [^>]*>", r''),
    (r"</div>", r''),
    # Collapse the markup onto a single line.
    (r"\n", r' '),
)


def _readText(path):
    """Return the contents of *path* decoded as windows-1252 text.

    Bytes that cannot be decoded are replaced rather than raising, and
    every line-ending style is translated to a single newline. The file
    is closed before the function returns. ``io.open`` is used so the
    same code runs on Python 2.6+ and Python 3.
    """
    with io.open(path, 'r', encoding='windows-1252',
                 errors='replace') as handle:
        return handle.read()


class Converter(object):
    """Convert plain text documents to Junct Topics."""

    def __init__(self, fileName):
        """Remember the path of the document to convert."""
        self.fileName = fileName

    def toTopic(self):
        """Return the whole file as one unicode string."""
        return _readText(self.fileName)


class WordDocument(Converter):
    """Convert Microsoft Word documents to Junct Topics."""

    def toTopic(self):
        """Return the document body as single-line, de-crufted HTML.

        Word is automated through COM to save the document as filtered
        HTML next to the source file (same name, ``.htm`` extension).
        That intermediate file is read back, cleaned up and deleted.

        Raises:
            ImportError: if pywin32 (``win32com``) is not installed.
        """
        if win32com is None:
            raise ImportError(
                "WordDocument requires the pywin32 package (win32com)")

        # splitext only looks at the final path component, so dots in
        # directory names or a missing extension cannot mangle the path.
        htmlFile = os.path.splitext(self.fileName)[0] + u'.htm'

        self._saveAsFilteredHtml(htmlFile)
        try:
            content = _readText(htmlFile)
        finally:
            # Delete the intermediate file, even if reading it failed.
            try:
                os.remove(htmlFile)
            except OSError:
                pass

        return self._stripCruft(content)

    def _saveAsFilteredHtml(self, htmlFile):
        """Have Word write self.fileName to *htmlFile* as filtered HTML."""
        app = win32com.client.Dispatch('Word.Application')
        try:
            # NOTE(review): Documents.Add takes a template path, so this
            # presumably creates a new document based on the file rather
            # than opening the file itself -- confirm against Word docs.
            doc = app.Documents.Add(self.fileName)
            try:
                doc.SaveAs(htmlFile, _WD_FORMAT_FILTERED_HTML)
            finally:
                doc.Close(_WD_DO_NOT_SAVE_CHANGES)
        finally:
            # Always shut Word down so a failed conversion does not
            # leave a Word process running.
            app.Quit()

    @staticmethod
    def _stripCruft(content):
        """Grab the body element of *content* and strip out HTML cruft."""
        for pattern, replacement in _CLEANUP_RULES:
            content = re.sub(pattern, replacement, content)
        return content
Welcome to IWETHEY!