用python写一个将word转为html的本地可执行程序

2019-10-21

目标：一键批量把 word 转为 html
优势：可嵌入到项目里，通过命令执行；可本地直接双击执行；可定制化；

实现思路：

通过 pywin32 调用本地的 word 程序把 .docx 格式的文档转存为筛选过的网页（.htm,.html）
用 html.parser 模块解析读取的html, 通过 re 模块的正则替换删除冗余内容，通过 bs4 模块对 html 进行其他优化处理
用 codecs 模块保存优化后的 html
用 pyinstaller(或cxfreeze, pyinstaller打包出的可执行程序更稳定) 将上述 python 脚本打包成 word2html.exe 可执行程序
将打包后的内容放在项目目录下，并在项目目录下的 package.json 的 scripts 里配置 start word2html.exe，使程序可在项目中运行

准备工作：

0. python 学习
python文档
 廖雪峰Python教程

1. 搭建 python 环境
Windows 环境下去官网下载与电脑位数匹配的 python 版本，如下，我下载的是64位的 python 3.7.5
This is an example image

命令行输入 where python 查询 python 安装目录并配置环境变量
This is an example image

2. 下载 PyCharm 方便编写 python 程序

3. 下载安装第三方模块 Python Packaging User Guide
pywin32
注意点：要下载python对应，与电脑位数64位相同的版本，如我下载的是：64位的，对应python3.7版本的
This is an example image

Python 正则表达式：re 是 python 标准库自带的模块，直接 import re 即可，无需 pip 安装
Beautiful Soup（pip install beautifulsoup4；下面的代码使用 "html5lib" 解析器，还需要 pip install html5lib）
cchardet（注意：下面的代码实际 import 的是 chardet，需安装 chardet，或把 import 改为 cchardet）

4. 安装打包插件 cx-Freeze
pip install cx-Freeze[==版本号]
cx_freeze安装参考

5. 代码实现
word2html

word转htmlview raw

# -*- coding: utf-8 -*-
import codecs
from win32com import client as wc
import re
import chardet
from bs4 import BeautifulSoup, Comment
import os


def init():
    """Convert every ``.docx`` file in the current working directory to HTML.

    An ``html`` sub-directory is created next to the documents (an existing
    one is reused) and ``main`` is called once per document found.
    """
    path = os.getcwd()  # directory that holds the Word documents
    html_dir = os.path.join(path, 'html')
    os.makedirs(html_dir, exist_ok=True)
    for file in os.listdir(path):
        # splitext inspects the last extension only, so names without a dot
        # no longer raise IndexError and names such as "v1.2.docx" are found.
        if os.path.splitext(file)[1] != '.docx':
            continue
        if os.path.isfile(os.path.join(path, file)):
            main(path, html_dir, file)


def main(word_dir, html_dir, word_name):
    """Convert one Word document to a cleaned-up HTML file.

    Args:
        word_dir: Directory containing the Word document.
        html_dir: Directory the HTML file is written to.
        word_name: File name of the Word document, including its extension.
    """
    # splitext drops only the final extension, so "report.v2.docx" becomes
    # "report.v2.html" instead of "report.html".
    html_name = os.path.splitext(word_name)[0] + '.html'
    html_path = os.path.join(html_dir, html_name)
    html = word_to_html(os.path.join(word_dir, word_name), html_path)
    html = format_html(html)
    html = my_beautiful_soup(html)

    html = re.sub(r'<strong>\s*</strong>', r'', html)  # empty <strong> elements
    # Collapse a run of "&nbsp;" entities into one. The group is required:
    # without it the quantifier applies to the ';' character only.
    html = re.sub(r'(?:&nbsp;){2,}', r'&nbsp;', html)
    html = re.sub(r'\s+', r' ', html)  # collapse whitespace runs into one space

    save_file(html, html_path)


def word_to_html(doc_path, export_path):
    """Let Word save a document as filtered HTML and return the HTML text.

    Args:
        doc_path: Path of the Word document to open.
        export_path: Path of the HTML file Word writes.

    Returns:
        The content of the exported file, read with the platform's default
        text encoding.
    """
    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(doc_path)
        try:
            doc.SaveAs(export_path, 10)  # 10 = wdFormatFilteredHTML
        finally:
            doc.Close()
    finally:
        # Runs even when Open/SaveAs/Close fail, so Word is always asked to quit.
        word.Quit()
    with open(export_path, 'r') as exported:
        return exported.read()


def format_html(res):
    """Strip Word-specific markup from exported HTML with regular expressions.

    Args:
        res: HTML text exported by Word.

    Returns:
        The HTML collapsed onto one line, with the opening <html> tag replaced
        by a doctype, <font> and <u> wrappers removed, <ins> and <o:p>
        elements dropped and <b> turned into <strong>.
    """
    # Collapse whitespace first: afterwards no tag spans several lines, which
    # the '.'-based patterns below rely on (including the <html ...> one).
    res = re.sub(r'\s+', r' ', res)
    res = re.sub(r'<html.*?>', r'<!DOCTYPE html>', res)
    res = re.sub(r'\s+>', r'>', res)  # whitespace before a closing '>'
    res = re.sub(r'>\s+<', r'><', res)  # whitespace between tags
    res = re.sub(r'<font.*?>(.*?)</font>', r'\1', res)  # unwrap <font>
    res = re.sub(r'<ins.*?</ins>', r'', res)  # drop <ins> and its content
    res = re.sub(r'<u>(.*?)</u>', r'\1', res)  # unwrap <u>
    # Drop <o:p> and its content. The pattern is anchored on "o:p" so that
    # other elements starting with "o" (e.g. <ol>) are left alone.
    res = re.sub(r'<o:p\b[^>]*>.*?</o:p>', r'', res)
    res = re.sub(r'<b>(.*?)</b>', r'<strong>\1</strong>', res)
    return res

def my_beautiful_soup(html):
    """Clean the HTML tree with BeautifulSoup and return it as a string.

    The document is parsed with html5lib, its <head> is replaced by a fixed
    one, class/style attributes are removed (underline and text alignment are
    re-added), <span> and <i> wrappers are unwrapped, neighbouring elements
    with the same name and style are merged and comments are removed.

    Args:
        html: HTML text to clean.

    Returns:
        The serialized document as a str.
    """
    soup = BeautifulSoup(html, "html5lib")

    # Build a replacement <head>: viewport meta, content-type meta, empty <title>.
    head_tag = soup.new_tag('head')
    meta1 = soup.new_tag('meta', attrs={'name': 'viewport', 'content': 'width=device-width,initial-scale=1,maximum-scale=1,minimum-scale=1,user-scalable=no'})
    meta2 = soup.new_tag('meta', attrs={'content': 'text/html', 'charset': 'utf-8', 'http-equiv': 'Content-Type'})
    title = soup.new_tag('title')
    head_tag.append(meta1)
    head_tag.append(meta2)
    head_tag.append(title)
    soup.head.replace_with(head_tag)


    # Walk every element of the tree.
    for tag in soup.find_all(True):
        # Remove the class attribute.
        del tag['class']

        if not tag.get('style') is None:
            tag_style = tag['style']
            del tag['style']  # remove the element's inline style
            # Disabled: convert bold styles into <strong>.
            # if ('font-weight:bold' in tag_style) and (not tag.string is None):
            #     strong_tag = soup.new_tag("strong")
            #     if 'text-decoration:underline' in tag_style:
            #         strong_tag['style'] = 'text-decoration:underline;'
            #     strong_tag.string = tag.string
            #     tag.replace_with(strong_tag)
            # Underlined text: replace the element by a <u> holding its string.
            if 'text-decoration:underline' in tag_style and (not tag.string is None):
                u_tag = soup.new_tag("u")
                u_tag['style'] = 'text-decoration:underline;'
                u_tag.string = tag.string
                tag.replace_with(u_tag)
            # NOTE(review): when the branch above ran, `tag` is the element that
            # was just replaced, so the alignment set below presumably never
            # reaches the output - confirm.
            # Re-add centered alignment.
            if 'text-align:center' in tag_style:
                tag['style'] = 'text-align:center;'
            # Re-add right alignment.
            elif 'text-align:right' in tag_style:
                tag['style'] = 'text-align:right;'

        # Disabled: merge identical neighbouring tags (done in a later loop instead).
        # if (not tag.previous_sibling is None) and (not tag.previous_sibling.string is None) and (tag.previous_sibling.name == tag.name) and (tag.previous_sibling.get('style') == tag.get('style')) and (not tag.string is None):
        #     tag.string.insert_before(tag.previous_sibling.string)
        #     if not tag.previous_sibling.name is None:
        #         tag.previous_sibling.decompose()

    # Unwrap <span> elements (their children stay in place).
    spans = soup.find_all('span')
    for span in spans:
        span.unwrap()

    # Unwrap <i> elements.
    i_tags = soup.find_all('i')
    for i_tag in i_tags:
        i_tag.unwrap()

    for tag in soup.find_all(True):
        # Merge neighbours: when the previous sibling has the same tag name and
        # style and both hold a single string, put the sibling's string in
        # front of this tag's string and destroy the sibling.
        if (not tag.previous_sibling is None) and (not tag.previous_sibling.string is None) and (
                tag.previous_sibling.name == tag.name) and (tag.previous_sibling.get('style') == tag.get('style')) and (
        not tag.string is None):
            tag.string.insert_before(tag.previous_sibling.string)
            if not tag.previous_sibling.name is None:
                tag.previous_sibling.decompose()

    # Remove comment nodes.
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()

    return str(soup).encode().decode('utf-8')

def save_file(res, html_path):
    """Write the cleaned HTML text ``res`` to ``html_path`` encoded as UTF-8."""
    out = codecs.open(html_path, 'w+', 'utf-8')
    try:
        out.write(res)
    finally:
        out.close()

if __name__ == '__main__':
    # Start the batch conversion only when run as a script, not on import.
    init()

另一个版本的:

word转htmlview raw

# -*- coding: utf-8 -*-
import codecs
from win32com import client as wc
import re
import chardet
from bs4 import BeautifulSoup
import os
from html.parser import HTMLParser
# Distinct inline "style" attribute values collected while parsing; the index
# of an entry is used as the numeric suffix of its generated "clsN" class.
globalClsList = []

def init():
    """Convert every ``.docx`` file in the current working directory to HTML.

    An ``html`` sub-directory is created next to the documents (an existing
    one is reused) and ``main`` is called once per document found.
    """
    path = os.getcwd()  # directory that holds the Word documents
    html_dir = os.path.join(path, 'html')
    os.makedirs(html_dir, exist_ok=True)
    for file in os.listdir(path):
        # splitext inspects the last extension only, so names without a dot
        # no longer raise IndexError and names such as "v1.2.docx" are found.
        if os.path.splitext(file)[1] != '.docx':
            continue
        if os.path.isfile(os.path.join(path, file)):
            main(path, html_dir, file)

def main(word_dir, html_dir, word_name):
    """Convert one Word document to HTML whose inline styles become CSS classes.

    Word first exports ``<name>.html``; the processed result is saved next to
    it as ``<name>1.html``.

    Args:
        word_dir: Directory containing the Word document.
        html_dir: Directory the HTML files are written to.
        word_name: File name of the Word document, including its extension.
    """
    # splitext drops only the final extension, so dotted names stay intact.
    base_name = os.path.splitext(word_name)[0]
    html_name = base_name + '.html'
    html_name1 = base_name + '1.html'
    html = word_to_html(os.path.join(word_dir, word_name), os.path.join(html_dir, html_name))
    res = format_html(html)
    # Feeding the parser collects every distinct inline style in globalClsList.
    parser = MyHTMLParser()
    parser.feed(res)
    res = str(my_beautiful_soup(res))
    # One ".clsN{...}" rule per collected style; N is the entry's position.
    cls_list_text = ''.join(
        '.cls%s{%s}' % (idx, cls) for idx, cls in enumerate(globalClsList))
    cls_list_text = '<style>' + cls_list_text + '</style></head>'
    # Plain string replacement: with re.sub, a backslash inside the collected
    # CSS would be interpreted as an escape in the replacement template.
    res = res.replace('</head>', cls_list_text)
    save_file(res, os.path.join(html_dir, html_name1))

def word_to_html(doc_path, export_path):
    """Let Word save a document as filtered HTML and return the HTML text.

    Args:
        doc_path: Path of the Word document to open.
        export_path: Path of the HTML file Word writes.

    Returns:
        The content of the exported file, read with the platform's default
        text encoding.
    """
    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(doc_path)
        try:
            doc.SaveAs(export_path, 10)  # 10 = wdFormatFilteredHTML
        finally:
            doc.Close()
    finally:
        # Runs even when Open/SaveAs/Close fail, so Word is always asked to quit.
        word.Quit()
    with open(export_path, 'r') as exported:
        return exported.read()

def format_html(res):
    """Return ``res`` as is; this variant applies no clean-up at this stage."""
    untouched = res
    return untouched

def format_html1(res):
    """Strip Word-specific markup, styles and attributes from exported HTML.

    Args:
        res: HTML text exported by Word.

    Returns:
        The cleaned HTML collapsed onto one line, with a fixed <head>
        (viewport and UTF-8 content-type meta tags) and a leading doctype.
    """
    res = re.sub(r'charset=[\w\d\-]+', 'charset=utf-8', res)

    # chardet reports None when it cannot guess; in that case the re-decoding
    # and the encoding-name replacement further down are skipped.
    code_type = chardet.detect(res.encode('utf-8'))['encoding']
    if code_type:
        try:
            res = res.encode('utf-8').decode(code_type)
        except (UnicodeDecodeError, LookupError):
            # ``res`` is already text. The previous fallback called
            # ``res.decode('gbk')``, which does not exist on str and raised
            # AttributeError; keep the text unchanged instead.
            pass

    head = r'''<head>
    <meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,minimum-scale=1,user-scalable=no">
    <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
    </head>
    '''
    # Remove the generated <style> and <xml> blocks. Non-greedy, so content
    # between two separate blocks is kept.
    res = re.sub(r'<style[^>]*>[\s\S]*?</style>', r'', res)
    res = re.sub(r'<xml>[\s\S]*?</xml>', r'', res)
    if code_type:
        # Rename the detected encoding to utf-8 (escaped: it is not a pattern).
        # NOTE(review): this also rewrites the name inside body text - confirm
        # that is intended.
        res = re.sub(re.escape(code_type), r'utf-8', res)
    # Install the fixed <head>. DOTALL because the exported head spans lines
    # and whitespace has not been collapsed yet at this point.
    res = re.sub(r'<head>.*?</head>', head, res, flags=re.S)

    # Whitespace normalisation.
    res = re.sub(r'\s+', r' ', res)  # collapse whitespace runs into one space
    res = re.sub(r'\s+>', r'>', res)  # whitespace before a closing '>'
    res = re.sub(r'<span>\s{0,}</span>', r'', res)  # empty <span> elements
    res = re.sub(r'>\s+<', r'><', res)  # whitespace between tags
    res = re.sub(r'<font.*?>(.*?)</font>', r'\1', res)  # unwrap <font>
    res = re.sub(r'<b>(.*?)</b>', r'\1', res)  # unwrap <b>
    res = re.sub(r'<u>(.*?)</u>', r'\1', res)  # unwrap <u>
    # Drop <o:p> and its content; anchored on "o:p" so <ol> etc. survive.
    res = re.sub(r'<o:p\b[^>]*>.*?</o:p>', r'', res)
    res = re.sub(r'<html.*?>(.*?)</html>', r'<!DOCTYPE html>\1</html>', res)
    res = re.sub(r'mso-.*?;', r'', res)  # drop mso-* declarations
    res = re.sub(r'class=".*?"', r'', res)  # drop generated class attributes
    res = re.sub('font-family:([\u4e00-\u9fa5]*?);', r'', res)  # drop CJK-named font-family
    # Keep one decimal place of pt values. Only the extra digits are consumed,
    # so the match cannot run on into a later declaration.
    res = re.sub(r'(:\d*\.\d)\d*(pt;)', r'\1\2', res)
    return res

class MyHTMLParser(HTMLParser):
    """Parser that records each distinct inline ``style`` value in globalClsList."""

    def handle_starttag(self, tag, attrs):
        # attrs holds the (name, value) pairs of the start tag.
        for attr_name, attr_value in attrs:
            if attr_name != 'style':
                continue
            if attr_value in globalClsList:
                continue
            globalClsList.append(attr_value)

    def handle_endtag(self, tag):
        """End tags are ignored."""

    def handle_data(self, data):
        """Text content is ignored."""

def my_beautiful_soup(html):
    """Replace inline styles by generated classes and merge equal neighbours.

    Every element with a ``style`` attribute gets the class ``clsN`` instead,
    where N is the position of that style text in ``globalClsList``. When the
    previous sibling has the same tag name and class and both hold a single
    string, the two are merged into the current element.

    Args:
        html: HTML text to process.

    Returns:
        The modified BeautifulSoup object.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all(style=True):
        # NOTE(review): index() raises ValueError for a style text that was
        # not collected into globalClsList beforehand - confirm the caller
        # always feeds the same markup to the collecting parser first.
        idx = globalClsList.index(tag['style'])
        tag['class'] = 'cls' + str(idx)
        del tag['style']

        prev = tag.previous_sibling
        # Text nodes carry no tag name, so only a real element can match here.
        if prev is None or getattr(prev, 'name', None) != tag.name:
            continue
        # .get() instead of ['class']: a sibling without a class attribute
        # must be skipped, not abort the conversion with KeyError.
        if prev.get('class') != tag['class']:
            continue
        if tag.string is not None and prev.string is not None:
            tag.string = prev.string + tag.string
            prev.decompose()

    return soup

def save_file(res, html_path):
    """Write the processed HTML text ``res`` to ``html_path`` encoded as UTF-8."""
    out = codecs.open(html_path, 'w+', 'utf-8')
    try:
        out.write(res)
    finally:
        out.close()

if __name__ == '__main__':
    # Start the batch conversion only when run as a script, not on import.
    init()

word转html,20210908补充现在用的 word2html.py 文件view raw

# -*- coding: utf-8 -*-
import codecs
from win32com import client as wc
import re
import chardet
from bs4 import BeautifulSoup, Comment
import os
import shutil


def init():
    """Convert every Word document in ``./word`` to HTML in a fresh ``./html``.

    Both directories are resolved against the current working directory. An
    existing ``html`` directory is deleted and recreated, then ``main`` is
    called for each ``.doc`` / ``.docx`` file found in ``word``.
    """
    path = os.getcwd()
    word_path = os.path.join(path, 'word')
    html_dir = os.path.join(path, 'html')
    files = os.listdir(word_path)

    # Start from an empty output directory.
    if os.path.exists(html_dir):
        shutil.rmtree(html_dir)
    os.mkdir(html_dir)

    for file in files:
        # "~$name.docx" is the lock file Word keeps next to an open document;
        # it is not a document and cannot be converted.
        if file.startswith('~$'):
            continue
        # splitext requires a real extension (a file literally named "docx"
        # no longer qualifies); lower() also accepts ".DOCX" / ".DOC".
        extension = os.path.splitext(file)[1].lower()
        if extension not in ('.doc', '.docx'):
            continue
        # os.path.isfile needs the full path, not the bare file name.
        if os.path.isfile(os.path.join(word_path, file)):
            main(word_path, html_dir, file)


def main(word_dir, html_dir, word_name):
    """Convert one Word document and save only its cleaned body markup.

    Args:
        word_dir: Directory containing the Word document.
        html_dir: Directory the HTML file is written to.
        word_name: File name of the Word document, including its extension.
    """
    # File name without its last dot-separated part, plus ".html".
    stem = '.'.join(word_name.split('.')[:-1])
    target_path = os.path.join(html_dir, stem + '.html')
    source_path = os.path.join(word_dir, word_name)

    markup = my_beautiful_soup(format_html(word_to_html(source_path, target_path)))

    substitutions = (
        # Collapse whitespace runs into one space.
        (r'\s+', r' ', 0),
        # Trim whitespace at both ends of <p> and <strong> content.
        (r'<p>\s*?(\S*?)\s*?</p>', r'<p>\1</p>', 0),
        (r'<strong>\s*?(\S*?)\s*?</strong>', r'<strong>\1</strong>', 0),
        # Only the body content is needed here; drop this rule when the whole
        # document should be kept.
        (r'(.*?)<body(.*?)>(.*)<\/body>(.*)', r'<!-- () -->\3', re.S),
    )
    for pattern, replacement, flags in substitutions:
        markup = re.sub(pattern, replacement, markup, flags=flags)

    # The clean-up pass is applied twice.
    for _ in range(2):
        markup = format_html1(markup)

    save_file(markup, target_path)


def word_to_html(doc_path, export_path):
    """Let Word save a document as filtered HTML and return the HTML text.

    Args:
        doc_path: Path of the Word document to open.
        export_path: Path of the HTML file Word writes.

    Returns:
        The content of the exported file, read with the platform's default
        text encoding.
    """
    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(doc_path)
        try:
            doc.SaveAs(export_path, 10)  # 10 = wdFormatFilteredHTML
        finally:
            doc.Close()
    finally:
        # Runs even when Open/SaveAs/Close fail, so Word is always asked to quit.
        word.Quit()
    with open(export_path, 'r') as exported:
        return exported.read()


def format_html(res):
    """Remove Word review comments and tidy tags in exported HTML.

    Args:
        res: HTML text exported by Word.

    Returns:
        The HTML without the comment section and comment anchors, without
        whitespace before '>' or between tags, and with <b> as <strong>.
    """
    # Replacing the opening <html> tag by a doctype is intentionally not done
    # here, because only the body content is kept later on.
    rules = [
        # Comment section at the end of the body, then the inline anchors.
        (r'<div>.*?<hr\s+class=msocomoff.*?<\/body>', r'</body>', re.S),
        (r'<a\s+class=msocomanchor.*?<\/a>', r'', re.S),
        # Whitespace before a closing '>' and between tags.
        (r'\s+>', r'>', 0),
        (r'>\s+<', r'><', 0),
        (r'<b>(.*?)<\/b>', r'<strong>\1</strong>', re.S),
    ]
    cleaned = res
    for pattern, replacement, flags in rules:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned


def format_html1(res):
    """Merge adjacent <u>/<strong> elements and drop empty elements.

    Args:
        res: HTML text.

    Returns:
        The HTML after the substitutions below were applied in order.
    """
    steps = (
        (r'</u><u>', r''),  # join consecutive <u> elements
        (r'</strong><strong>', r''),  # join consecutive <strong> elements
        (r'<strong>\s*?</strong>', ''),  # empty <strong>
        (r'<p>\s*?</p>', r''),  # empty <p>
        (r'<h\d><\/h\d>', ''),  # empty headings
        (r'<p align="center" style="text-align:center;"></p>', ''),  # empty centered <p>
    )
    cleaned = res
    for pattern, replacement in steps:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


def my_beautiful_soup(html):
    """Clean the HTML tree with BeautifulSoup and return it as a string.

    Removes class/style/align attributes (keeping center/right alignment as
    an inline style), unwraps <span>, <a>, <font>, <ins>, <i> and <o:p>,
    deletes <del> elements with their content and removes comments.

    Args:
        html: HTML text to clean.

    Returns:
        The serialized document as a str.
    """
    soup = BeautifulSoup(html, "html5lib")

    # No replacement <head> is built here: only the body content is used by
    # the caller. Restore that step when a complete document is required.

    for node in soup.find_all(True):
        del node['class']

        inline_style = node.get('style')
        if inline_style is None:
            continue
        del node['style']
        del node['align']
        # Keep only the alignment from the original inline style.
        if 'text-align:center' in inline_style:
            node['style'] = 'text-align:center;'
        elif 'text-align:right' in inline_style:
            node['style'] = 'text-align:right;'

    # Unwrap these elements (children stay), one tag name after the other.
    for wrapper_name in ('span', 'a', 'font', 'ins', 'i', 'o:p'):
        for wrapper in soup.find_all(wrapper_name):
            wrapper.unwrap()

    # <del> elements are removed together with their content.
    for deleted in soup.find_all('del'):
        deleted.decompose()

    # Remove comment nodes.
    for comment in soup(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    serialized = str(soup)
    return serialized.encode().decode('utf-8')


def my_beautiful_soup_after(html):
    """Re-parse ``html`` with html5lib and blank out elements without children.

    Args:
        html: HTML text.

    Returns:
        The serialized document as a str.
    """
    soup = BeautifulSoup(html, "html5lib")
    for node in soup.find_all(True):
        if node.string is not None or len(node.contents) != 0:
            continue
        node.replace_with('')
    serialized = str(soup)
    return serialized.encode().decode('utf-8')


def save_file(res, html_path):
    """Write the final HTML text ``res`` to ``html_path`` encoded as UTF-8."""
    out = codecs.open(html_path, 'w+', 'utf-8')
    try:
        out.write(res)
    finally:
        out.close()

if __name__ == '__main__':
    # Start the batch conversion only when run as a script, not on import.
    init()

6. 其他
用python创建 docx 文件
 别人用js实现的word2html
python-docx
mammoth.js

使用BeautifulSoup删除html中的scriptview raw

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Python 2 example (print statement, BeautifulSoup 3 import).
from BeautifulSoup import BeautifulSoup
# NOTE(review): `html` is not used below; the soup is built from a separate
# literal, which is what the recorded output corresponds to.
html = '''
<script>a</script>
baba
<script>b</script>
<h1>hi, world</h1>
'''
soup = BeautifulSoup('<script>a</script>baba<script>b</script><h1>')
# Detach every <script> element from the tree.
[s.extract() for s in soup('script')]
# Alternative: [s.extract() for s in soup.findAll('script')]
print soup

# Output:
# baba<h1></h1>

使用BeautifulSoup删除html中的注释view raw

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# Python 2 example (print statement, BeautifulSoup 3 import).
from BeautifulSoup import BeautifulSoup, Comment
data = """<div class="foo">
cat dog sheep goat
<!--
<p>test</p>
-->
</div>"""

soup = BeautifulSoup(data)

# Select the text nodes that are Comment instances and detach each of them.
for element in soup(text=lambda text: isinstance(text, Comment)):
    element.extract()

print soup.prettify()

# Output:
# <div class="foo">
#  cat dog sheep goat
# </div>