I wrote a Python Word->HTML converter in Feb
...for a larger application. Hack as needed/desired. The "SaveAs type 10" is one of the more important bits: format 10 tells Word to save "filtered" HTML.
import io
import os
import re

try:
    import win32com.client
except ImportError:
    # pywin32 is not installed (for example on a non-Windows machine).
    # Plain-text conversion does not need it, so keep the module importable;
    # WordDocument.toTopic raises ImportError when it is actually required.
    win32com = None


class Converter(object):
    """Convert plain text documents to Junct Topics."""

    def __init__(self, fileName):
        """Remember the path of the document to convert."""
        self.fileName = fileName

    def toTopic(self):
        """Return the whole of ``self.fileName`` as unicode text."""
        return self._readText(self.fileName)

    def _readText(self, path):
        """Read *path* as windows-1252 text and return it as unicode.

        Line endings are normalised to ``\\n`` (universal newlines) and
        undecodable bytes are replaced instead of raising.  The file is
        always closed, even if reading fails.
        """
        with io.open(path, 'r', encoding='windows-1252',
                     errors='replace') as source:
            return source.read()


class WordDocument(Converter):
    """Convert Microsoft Word documents to Junct Topics."""

    # Word object-model enumeration values, passed as plain integers.
    _FORMAT_FILTERED_HTML = 10  # WdSaveFormat.wdFormatFilteredHTML
    _DO_NOT_SAVE_CHANGES = 0    # WdSaveOptions.wdDoNotSaveChanges

    def toTopic(self):
        """Return the document body as a single line of simplified HTML.

        Word saves the document as filtered HTML next to the source file;
        that intermediate file is read back, reduced to the contents of its
        ``<body>`` element with Word-specific styling removed, and deleted.

        Raises:
            ImportError: if the win32com package is not available.
        """
        if win32com is None:
            raise ImportError(
                "WordDocument requires the win32com package (pywin32)")

        htmlFile = self._htmlPath()
        self._saveAsFilteredHtml(htmlFile)

        try:
            content = self._readText(htmlFile)
        finally:
            # Best effort: never leave the intermediate file behind, but do
            # not let a failed delete mask the real outcome.
            try:
                os.remove(htmlFile)
            except OSError:
                pass

        return self._cleanHtml(content)

    def _htmlPath(self):
        """Return ``self.fileName`` with its extension replaced by ``.htm``.

        Only a real extension on the final path component is replaced, so a
        name without one (``README``) becomes ``README.htm`` and dots in
        directory names are left alone.
        """
        return os.path.splitext(self.fileName)[0] + u'.htm'

    def _saveAsFilteredHtml(self, htmlFile):
        """Have Word write ``self.fileName`` to *htmlFile* as filtered HTML."""
        app = win32com.client.Dispatch('Word.Application')
        try:
            doc = app.Documents.Add(self.fileName)
            try:
                doc.SaveAs(htmlFile, self._FORMAT_FILTERED_HTML)
            finally:
                doc.Close(self._DO_NOT_SAVE_CHANGES)
        finally:
            # Always shut Word down; otherwise a failure above would leave
            # the automation process running.
            app.Quit()

    @staticmethod
    def _cleanHtml(content):
        """Return the ``<body>`` contents of *content* without HTML cruft.

        Removes single-quoted ``style`` attributes, ``class=Mso...``
        attributes and all ``<div>`` wrappers, then turns every newline into
        a space.  Text without a ``<body>`` element is cleaned as a whole.
        """
        content = content.replace(u'\r\n', u'\n')
        content = re.sub(r'(?s)^.*<body[^>]*>(.*)</body>.*$', r'\1', content)
        content = re.sub(r"style='[^']*'", r'', content)
        content = re.sub(r"class=Mso[^>]*", r'', content)
        content = re.sub(r"<div [^>]*>", r'', content)
        content = re.sub(r"</div>", r'', content)
        return content.replace(u'\n', u' ')