#! /usr/bin/env python3
"""Print the character encoding of an HTML file.

The encoding is taken from a ``<meta http-equiv="Content-Type" ...>`` tag
when that tag declares a charset; otherwise it is guessed with ``chardet``.
The path of the HTML file is read from the first command-line argument.
"""

import sys

from m_lib.net.www.html import HTMLParser as _HTMLParser


class HTMLHeadDone(Exception):
    """Raised by the parser callbacks to stop feeding the document."""


class HTMLParser(_HTMLParser):
    """Parser that records the charset declared in a Content-Type meta tag.

    The ``charset`` attribute is assigned only when a declaration is found,
    so callers check for it with ``hasattr``.
    """

    def end_head(self):
        """Abort parsing; nothing past this point is inspected.

        NOTE(review): presumably the base parser calls this for ``</head>``
        (sgmllib-style naming) -- confirm against m_lib.
        """
        raise HTMLHeadDone()

    def do_meta(self, attrs):
        """Inspect one meta tag and record its charset, if it declares one.

        :param attrs: iterable of ``(attrname, value)`` pairs.
        :raises HTMLHeadDone: when the tag is a Content-Type declaration,
            whether or not a charset could be extracted from it.
        """
        http_equiv = ""
        content = ""
        for attrname, value in attrs:
            if not value:
                continue
            value = value.strip()
            if attrname == 'http-equiv':
                http_equiv = value.lower()
            elif attrname == 'content':
                content = value

        if http_equiv != "content-type":
            return

        # Extract charset from "text/html; foo; charset=UTF-8; bar;".
        # partition() yields an empty tail when "charset=" is absent.
        _, _, tail = content.lower().partition('charset=')
        charset = tail.split(';')[0].strip(" \t\r\n'\"")
        if charset:
            # Empty declarations ("charset=;") are ignored so the caller can
            # still fall back to detection.
            self.charset = charset
        raise HTMLHeadDone()


def parse_html(filename):
    """Feed *filename* to an :class:`HTMLParser` and return the parser.

    Feeding stops as soon as a callback raises :class:`HTMLHeadDone`.  If a
    charset was found, ``parser.charset`` is normalised to lower case with
    ``"windows-"`` replaced by ``"cp"``.

    :param filename: path of the HTML file to read.
    :returns: the parser instance; it has a ``charset`` attribute only when
        the document declared one.
    :raises OSError: if the file cannot be opened or read.
    """
    parser = HTMLParser()
    # The file's encoding is exactly what is unknown here, so undecodable
    # bytes must not abort the scan: the meta tag itself is plain ASCII.
    with open(filename, 'r', errors='replace') as infile:
        try:
            for line in infile:
                parser.feed(line)
        except HTMLHeadDone:
            pass

    try:
        parser.close()
    except HTMLHeadDone:
        pass

    if hasattr(parser, "charset"):
        parser.charset = parser.charset.replace("windows-", "cp").lower()
    return parser


def _guess_charset(filename):
    """Return chardet's guess for the encoding of *filename* (may be None)."""
    # Third-party dependency, imported lazily: it is only needed when the
    # document does not declare its own charset.
    import chardet

    # chardet works on raw bytes, so the file must be read in binary mode.
    with open(filename, 'rb') as infile:
        charset = chardet.detect(infile.read())["encoding"]

    # NOTE(review): presumably these two guesses are treated as misdetected
    # cp1251 text -- confirm with the author.
    if charset in ("ISO-8859-2", "MacCyrillic"):
        charset = "cp1251"
    return charset


def main():
    """Print the detected charset of ``sys.argv[1]``; print nothing on failure."""
    try:
        filename = sys.argv[1]
        parser = parse_html(filename)
        if hasattr(parser, "charset"):
            charset = parser.charset
        else:
            charset = _guess_charset(filename)
        if charset:
            print(charset)
    except Exception:
        # Deliberately best-effort: a missing argument, an unreadable file or
        # a missing chardet results in empty output rather than a traceback.
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        pass


if __name__ == '__main__':
    main()