您现在的位置：首页 > 教案格式 > 正文

html 试题试卷（包含latex）下载成word - - java(2)

2020-08-17 11:07 网络整理教案网

到此我们就可以将文本的简单数据填充到模板，并下载成word了。接下来我们要做的，就是将html格式的试题转换为相应的word格式，再填充到模板。

解析html试题内容，将其结构化：

1. 引入XPath jar包

<dependency>
    <groupId>cn.wanghaomiao</groupId>
    <artifactId>JsoupXpath</artifactId>
    <version>2.2</version>
</dependency>

2. 解析HTML试题内容

/**
 * A node of the structured question HTML: either a tag (element) node or a text node.
 *
 * <p>Plain mutable bean. The setters for {@code nodeText} and {@code nodeStr} normalise
 * their argument (see the individual setters); all other setters store the value as given.
 */
public class Node {
    /** Name of the node (the tag name for tag nodes). */
    private String nodeName;
    /** Node kind: 1 = tag node, 2 = text node. */
    private Integer nodeType;
    /** Attributes attached to this node, keyed by attribute name. */
    private Map<String, String> attrMap;
    /** Trimmed text of the node, or {@code null} when there is no non-blank text. */
    private String nodeText;
    /** String form of the node with line breaks and tabs removed. */
    private String nodeStr;
    /** String form of the node's children. */
    private String childStr;
    /** Name of the parent node. */
    private String nodeParent;
    /** Child nodes of this node. */
    private List<Node> childNodeList;

    public String getNodeName() {
        return nodeName;
    }

    public void setNodeName(String nodeName) {
        this.nodeName = nodeName;
    }

    public Integer getNodeType() {
        return nodeType;
    }

    public void setNodeType(Integer nodeType) {
        this.nodeType = nodeType;
    }

    public Map<String, String> getAttrMap() {
        return attrMap;
    }

    public void setAttrMap(Map<String, String> attrMap) {
        this.attrMap = attrMap;
    }

    public String getNodeText() {
        return nodeText;
    }

    /**
     * Stores the trimmed text; {@code null}, empty and whitespace-only input are all
     * stored as {@code null}.
     *
     * @param nodeText raw text, may be {@code null}
     */
    public void setNodeText(String nodeText) {
        // The null check has to come before trim(); the other order throws on null input.
        if (nodeText == null) {
            this.nodeText = null;
            return;
        }
        String trimmed = nodeText.trim();
        this.nodeText = trimmed.isEmpty() ? null : trimmed;
    }

    public String getNodeStr() {
        return nodeStr;
    }

    /**
     * Stores the node string with every {@code \n}, {@code \r} and {@code \t} removed.
     *
     * @param nodeStr raw node string, may be {@code null} (stored as {@code null})
     */
    public void setNodeStr(String nodeStr) {
        this.nodeStr = (nodeStr == null) ? null : nodeStr.replaceAll("[\n\r\t]", "");
    }

    public String getChildStr() {
        return childStr;
    }

    public void setChildStr(String childStr) {
        this.childStr = childStr;
    }

    public String getNodeParent() {
        return nodeParent;
    }

    public void setNodeParent(String nodeParent) {
        this.nodeParent = nodeParent;
    }

    public List<Node> getChildNodeList() {
        return childNodeList;
    }

    public void setChildNodeList(List<Node> childNodeList) {
        this.childNodeList = childNodeList;
    }
}

解析html

 /**
     * Extracts the first-level nodes found under the {@code body} of an HTML string.
     *
     * <p>{@code ##} is the separator used when extracting text, so it must not occur in
     * the question text; every occurrence is removed before parsing.
     *
     * <p>Date: 2019/9/7 15:29
     *
     * @author maoyuwei
     * @param htmlStr the HTML of the question text; may be {@code null}
     * @return the nodes produced by {@code childStrToNodes} for the body; a single-element
     *         list holding the body node itself when that call yields nothing; an empty
     *         list when {@code htmlStr} is {@code null} or no body node can be selected
     */
    public static List<Node> htmlStrToNodes(String htmlStr) {
        if (htmlStr == null) {
            return new ArrayList<>();
        }
        // Strings are immutable: the result of replace() has to be kept, otherwise the
        // separator stays in the text and corrupts the later split on "##".
        String cleanedHtml = htmlStr.replace("##", "");

        JXDocument jxDocument = JXDocument.create(cleanedHtml);
        JXNode bodyNode = jxDocument.selNOne("/body");
        if (bodyNode == null) {
            return new ArrayList<>();
        }

        Node node = new Node();
        node.setNodeType(1);
        node.setNodeName(bodyNode.asElement().nodeName());
        node.setChildStr(bodyNode.asElement().html());

        List<Node> nodes = childStrToNodes(node, bodyNode);
        if (nodes == null || nodes.isEmpty()) {
            // No child nodes: the body node itself carries the text.
            nodes = new ArrayList<>();
            nodes.add(node);
        }
        return nodes;
    }
    /**
     *@Author: maoyuwei
     *@Date: 2019/9/7 15:29
     *@Desc:  提取当前标签下的子标签节点
     */
    public  static List childStrToNodes(Node node,JXNode jxNode){
        String childPath = "/child::*";
        List childJXNodeList=jxNode.sel(childPath);
        if(childJXNodeList==null||childJXNodeList.size()<1){
            if(node.getChildStr()!=null||!node.getChildStr().equals("")){
                node.setNodeText(node.getChildStr().replaceAll("[\n\r\t]",""));
            }
            return null;
        }
        List tempChildNodes=new ArrayList();
        String currentHtmlStr=node.getChildStr();
        int index=0;
        for(JXNode childJXNode:childJXNodeList) {
            String childNodeStr=childJXNode.toString();
            //标记本级文本
            int beginIndex=currentHtmlStr.indexOf(childNodeStr);
            if(beginIndex<0) {continue;}
            currentHtmlStr=currentHtmlStr.substring(0,beginIndex)+"##<"+index+">"+currentHtmlStr.substring(beginIndex+childNodeStr.length());
            index++;
            Node childNode=new Node();
            Element element=childJXNode.asElement();
            childNode.setNodeName(element.nodeName());
            childNode.setNodeStr(childJXNode.toString());
            childNode.setNodeParent(node.getNodeName());
            childNode.setChildStr(element.html());
            childNode.setNodeType(1);
            //属性逐级下沉，每个子标签都带父级的属性
            Map attrMap= JsonUtil.fromJson(JsonUtil.toJson(node.getAttrMap()), Map.class);
            if (attrMap==null) {attrMap=new HashMap();}
            if (childNode.getNodeName()!=null&&childNode.getNodeName().equals("tr")) {attrMap=new HashMap<>();}//表格标签tr中不写入父级属性
            if(element.attributes()!=null) {
              for(Attribute attribute: element.attributes().asList())  {
                  if(attribute.getKey().equals("style")){
                      String style=attrMap.get(attribute.getKey())==null?"":attrMap.get(attribute.getKey());
                      attrMap.put(attribute.getKey(),style+" "+attribute.getValue());
                  }else {
                      attrMap.put(attribute.getKey(),attribute.getValue());
                  }
              }
            }
            //标签所带属性写入
            if(childNode.getNodeName()!=null&&"b i u B I U".contains(childNode.getNodeName())){
                String style=attrMap.get("style")==null?"":attrMap.get("style");
                attrMap.put("style",style+" _"+childNode.getNodeName().toLowerCase());
            }
            childNode.setAttrMap(attrMap);
            childStrToNodes(childNode,childJXNode);
            tempChildNodes.add(childNode);
        }
        //提取取本级文本，和本级标签节点按顺序存储
        String[] textList=currentHtmlStr.split("##");
        List childNodes=new ArrayList();
        for(int i=0;i){
            String textStr=textList[i];
            if(textStr==null||textStr.equals("")) {continue;}
            String  regex="(?<=<)[0-9]+(?=>)";
            Pattern pattern = Pattern.compile(regex);
            Matcher matcher = pattern.matcher(textStr);
            Integer position=null;
            if(matcher.find()) {
                String positionStr=matcher.group();
                position=Integer.parseInt(positionStr.toString());
            }
            if(position!=null&&position<tempChildNodes.size()){
                childNodes.add(tempChildNodes.get(position));
            }
            textStr=textStr.replaceAll("<[0-9]+>","");
            textStr=textStr.replaceAll("[\n\r\t]","");
            if(textStr==null||textStr.equals("")) {continue;}
            Node textNode=new Node();
            textNode.setNodeType(2);
            textNode.setNodeText(textStr);
            Map attrMap= JsonUtil.fromJson(JsonUtil.toJson(node.getAttrMap()), Map.class);
            textNode.setAttrMap(attrMap);
            textNode.setNodeParent(node.getNodeName());
            childNodes.add(textNode);
        }
        node.setChildNodeList(childNodes);
        return  childNodes;
    }

以上代码就可以将html的题文结构化，接下来我们将这个结构化的题文翻译成word格式

将结构化的题文翻译为word格式：

这块的代码相当多，写得也不好，就不粘了，大家可以看下我的源码（看 com\pdl\paperdownload\wordpapermake\htmltowordhandle 下面的类就行）。简单介绍一下这块的内容：

试卷字体格式_试卷格式模板_标准试卷格式

1. 将结构化的题文里的每个node解析成word的格式，得自己总结HTML样式跟word样式的对应关系

下面简单罗列几个对应样式：

· 下面蓝色部分为word段落样式设定位置；黄色部分为内容风格设定位置

<w:p w:rsidR="00A81065" w:rsidRDefault="00BF47F0">
    <w:pPr>
        
        
        
    </w:pPr>
    <w:r w:rsidRPr="00043B54">
        <w:t xml:space="preserve"> 【解析】</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            
            
        </w:rPr>
        <w:t>试题分析</w:t>
    </w:r>
</w:p>

· 居中：word段落样式

<w:jc w:val="center"/>

· 首行缩进：word段落样式

<w:ind w:firstLineChars="200" w:firstLine="480"/>

· 右对齐：word段落样式

<w:jc w:val="right"/>

· 加粗：内容样式

<w:b/>

· 斜体：内容样式

<w:i/>

· 下划线：内容样式

<w:u w:val="single"/>

· 允许内容中的空格显示设定（绿色部分为修改部分）

<w:r>
    <w:t xml:space="preserve"> hello pretty girl！</w:t>
</w:r>

· 上下标 sub sup：内容样式

<w:vertAlign w:val="superscript"/>
<w:vertAlign w:val="subscript"/>

试卷格式模板_标准试卷格式_试卷字体格式

其他格式：

· omml 公式放在句子标签后面（关于latex转换为word公式omml的方式，可以看我的另一篇分享：）

· 图片部分放在段落标签后面

textAlignment w:val="center" 为图片行内居中

设置图片宽高



注：doc的图片得将它的base64码，和引用定义填充到模板；docx的图片放到word\media文件夹下，并将引用定义填充到模板

<w:p w:rsidR="00A81065" w:rsidRDefault="00BF47F0" w:rsidP="00744A41">
    <w:pPr>
        <w:textAlignment w:val="center"/>
        <w:rPr>
            <w:rFonts w:hint="eastAsia"/>
        </w:rPr>
    </w:pPr>
    <w:r>
        <w:pict>
            <v:shape id="_x0000768de9d0ea6111e98c2a28843b052b2f" type="_x0000_t75" style="width:85pt;height:43pt">
                <v:imagedata r:id="rId768de9d0ea6111e98c2a28843b052b2f" o:title="2"/>
            </v:shape>
        </w:pict>
    </w:r>
</w:p>

· 表格部分（<w:tbl>）放在和段落 <w:p> 同级

w:gridCol w:w="222" 列宽设置

<w:tr> 相当于html table中的 <tr>

<w:tc> 相当于html table中的 <td>
注意：word合并单元格的方式和HTML不太一样，不好描述，多试试自己就能发现规律

<w:tbl>
    <w:tblPr>
        <w:tblW w:w="0" w:type="auto"/>
        <w:tblLook w:val="04A0" w:firstRow="1" w:lastRow="0" w:firstColumn="1" w:lastColumn="0" w:noHBand="0" w:noVBand="1"/>
    </w:tblPr>
    <w:tblGrid>
        <w:gridCol w:w="222"/>
        <w:gridCol w:w="1376"/>
    </w:tblGrid>
    <w:tr w:rsidR="00A93926" w:rsidTr="00682485">
        <w:tc>
            <w:tcPr>
                <w:tcW w:w="0" w:type="auto"/>
            </w:tcPr>
            <w:p w:rsidR="00A93926" w:rsidRDefault="00286E3F" w:rsidP="00A93926"/>
        </w:tc>
        <w:tc>
            <w:tcPr>
                <w:tcW w:w="0" w:type="auto"/>
                <w:vAlign w:val="center"/>
            </w:tcPr>
            <w:p w:rsidR="00A93926" w:rsidRPr="00A93926" w:rsidRDefault="00BF47F0" w:rsidP="00A93926">
                <w:pPr>
                    <w:rPr>
                        <w:b/>
                    </w:rPr>
                </w:pPr>
                <w:r>
                    <w:rPr>
                        <w:rFonts w:hint="eastAsia"/>
                        <w:b/>
                    </w:rPr>
                    <w:t>一、</w:t>
                </w:r>
                <w:r>
                    <w:rPr>
                        <w:rFonts w:hint="eastAsia"/>
                        <w:b/>
                    </w:rPr>
                    <w:t xml:space="preserve"> </w:t>
                </w:r>
                <w:r>
                    <w:rPr>
                        <w:rFonts w:hint="eastAsia"/>
                        <w:b/>
                    </w:rPr>
                    <w:t>解答题</w:t>
                </w:r>
            </w:p>
        </w:tc>
    </w:tr>
</w:tbl>

另外应留意：html中有一些特殊字符，得提前转义，不然会导致word打不开

  =>  
&lt; => &#60;
&gt; => &#62;
&amp; => &#38;
&quot; => &#34;
&apos; => &#39;
¢ => ¢
£ => £
¥ => ¥
€ => €
§ => §
© => ©
® => ®
™ => ™
× => ×
÷ => ÷
  =>  
  =>  
Α => Α
Γ => Γ
Ε => Ε
Η => Η
Ι => Ι
Λ => Λ
Ν => Ν
Ο => Ο
Ρ => Ρ
Τ => Τ
Φ => Φ
Ψ => Ψ
α => α
γ => γ
ε => ε
η => η
ι => ι
λ => λ
ν => ν
ο => ο
ρ => ρ
σ => σ
υ => υ
χ => χ
ω => ω
ϒ => ϒ
• => •
′ => ′
‾ => ‾
℘ => ℘
ℜ => ℜ
ℵ => ℵ
↑ => ↑
↓ => ↓
↵ => ↵
⇑ => ⇑
⇓ => ⇓
∀ => ∀
∃ => ∃
∇ => ∇
∉ => ∉
∏ => ∏
− => −
√ => √
∞ => ∞
∧ => ⊥
∩ => ∩
∫ => ∫
∼ => ∼
≈ => ≅
≡ => ≡
≥ => ≥
⊃ => ⊃
⊆ => ⊆
⊕ => ⊕
⊥ => ⊥
⌈ => ⌈
⌊ => ⌊
◊ => ◊
♣ => ♣
♦ => ♦
¡ => ¡
£ => £
¥ => ¥
§ => §
© => ©
« => «
 => 
¯ => ¯
± => ±
³ => ³
µ => µ
Β => Β
Δ => Δ
Ζ => Ζ
Θ => Θ
Κ => Κ
Μ => Μ
Ξ => Ξ
Π => Π
Σ => Σ
Υ => Υ
Χ => Χ
Ω => Ω
β => β
δ => δ
ζ => ζ
θ => θ
κ => κ
μ => μ
ξ => ξ
π => π
ς => ς
τ => τ
φ => φ
ψ => ψ
ϑ => ϑ
ϖ => ϖ
… => …
″ => ″
⁄ => ⁄
ℑ => ℑ
™ => ™
← => ←
→ => →
↔ => ↔
⇐ => ⇐
⇒ => ⇒
⇔ => ⇔
∂ => ∂
∅ => ∅
∈ => ∈
∋ => ∋
∑ => −
∗ => ∗
∝ => ∝
∠ => ∠
∨ => ⊦
∪ => ∪
∴ => ∴
≅ => ≅
≠ => ≠
≤ => ≤
⊂ => ⊂
⊄ => ⊄
⊇ => ⊇
⊗ => ⊗
⋅ => ⋅
⌉ => ⌉
⌋ => ⌋
♠ => ♠
♥ => ♥
  =>  
¢ => ¢
¤ => ¤
¦ => ¦
¨ => ¨
ª => ª
¬ => ¬
® => ®
° => °
² => ²
´ => ´
· => ·
ø => ø
á => á

根据以上规则，结合其它相应的规则，做html结构化题文的翻译。具体翻译代码很多，不在这里展示

源码：

注意：spring boot 的项目，启动后访问 localhost:8080 到下载样例页：

可选择试卷，下载格式，纸张大小，字号，答案样式

试卷json数据放在：resources\paperdata

请求方法位置：com\pdl\paperdownload\main.java

总结：

整个过程就是一个了解HTML结构，了解word结构，然后翻译，拼接出一个完整word的过程

遇到问题：经常有格式错误或特殊字符造成word打不开的状况，这个是一大痛点，但经过不断优化，这种状况已越来越少了

共2页:

试卷格式模板相关阅读

教案格式热门阅读

html 试题试卷（包含latex）下载成word - - java(2)

教案格式排行

教案格式看点