三种方式实现mhtml格式转为excel
pdf转word, ppt转pdf等office,pdf格式相互转换都比较容易实现,比如可以用openoffice,onlyoffice等。但是mhtml(带有MIME附件的html文件)转excel,网上一搜可能没有那么多资料,本文将介绍三种方案将mhtml转为excel。
三种方式实现mhtml格式转为excel

        pdf转word, ppt转pdf等office,pdf格式相互转换都比较容易实现,比如可以用openoffice,onlyoffice等。但是mhtml(带有MIME附件的html文件)转excel,网上一搜可能没有那么多资料,本文将介绍三种方案将mhtml转为excel。

一.使用javax.mail解析mhtml

优点:可以按照自己的需要解析指定的内容和标签;

缺点:解析时整个文件是一次性加载到内存中,内存占用较大;需要自己解析指定的内容;

1.maven依赖

<dependency>
	<groupId>javax.mail</groupId>
	<artifactId>mail</artifactId>
	<version>1.4.1</version>
</dependency>
<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>1.14.3</version>
</dependency>

2.代码示例

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Properties;

import javax.mail.BodyPart;
import javax.mail.Multipart;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


/**
 * description:MHTMLReader
 *
 * @author: lgq
 * @create: 2024-05-16 15:36
 */
public class MHTMLReader {
    public static void main(String[] args) {
        for (int i = 0; i< 100;i++) {
            int random = (int)(Math.random()*(11));
            System.out.println(random);
        }
        try {
            File mhtmlFile = new File("C:\\Users\\lgq\\Downloads\\003-r.mhtml");
            InputStream inputStream = new FileInputStream(mhtmlFile);
            MimeMessage mimeMessage = new MimeMessage(Session.getDefaultInstance(new Properties(), null), inputStream);
            // 解析MimeMessage对象,获取所需信息
            Object content = mimeMessage.getContent();
            if (content instanceof Multipart) {
                Multipart multipart = (Multipart) content;
                BodyPart bodyPart = multipart.getBodyPart(0);

                Object content1 = bodyPart.getContent();
                //获取内容
                String s = content1.toString();
                Document document = Jsoup.parse(s);
                Elements elements = document.select("tr");
                for (Element element : elements) {
                    String text = element.text();
                    System.out.println(text);
                }
            }
        } catch (Exception e ) {
            e.printStackTrace();
        }
    }
}


二.使用aspose解析mhtml

优点:可以直接将mhtml格式转为其他格式,比如xlsx;

缺点:内存占用非常大,多并发情况下很容易造成服务崩溃;

1.maven依赖

<dependency>
	<groupId>com.aspose</groupId>
	<artifactId>aspose-cells</artifactId>
	<version>20.7</version>
	<scope>system</scope>
	<systemPath>${project.basedir}/src/main/resources/lib/aspose-cells-20.7-crack.jar</systemPath>
</dependency>

2.代码示例

package com.abc.def.service.impl;

import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.concurrent.Future;

import javax.annotation.PostConstruct;

import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.ExcelWriter;
import com.alibaba.excel.support.ExcelTypeEnum;
import com.alibaba.excel.write.metadata.WriteSheet;
import com.aspose.cells.License;
import com.aspose.cells.LoadFormat;
import com.aspose.cells.LoadOptions;
import com.aspose.cells.MemorySetting;
import com.aspose.cells.Workbook;
import com.aspose.cells.Worksheet;
import com.abc.def.common.utils.EasyExcelUtil;
import com.abc.def.listener.EasyExcelGeneralCsvListener;
import com.abc.def.service.ExcelAnalysisService;

import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.AsyncResult;
import org.springframework.stereotype.Service;

/**
 * description:ExcelAnalysisService实现类
 *
 * @author: lgq
 * @create: 2024-04-17 14:53
 */
@Service
@Slf4j
public class ExcelAnalysisServiceImpl implements ExcelAnalysisService {

    @Async("asyncExcelAnalysisServiceExecutor")
    @Override
    public Future<String> csv2Excel(InputStream inputStream, OutputStream outputStream, String sheetName, boolean isGzip) {
        try {
            ExcelWriter writer = EasyExcel.write(outputStream).excelType(ExcelTypeEnum.XLSX).build();

            EasyExcel.read(inputStream, new EasyExcelGeneralCsvListener(writer, new WriteSheet()))
                    .excelType(ExcelTypeEnum.CSV)
                    .charset(Charset.forName("UTF-8"))
                    .sheet()
                    .sheetNo(0)
                    .sheetName(sheetName)
                    .doRead();

            writer.finish();
            outputStream.flush();
        } catch (Exception e) {
            log.error("csv转为excel出错!", e.getMessage());
            e.printStackTrace();
        } finally {
            if (outputStream != null) {
                try {
                    outputStream.close();
                } catch (Exception e) {
                    log.error("outputStream.close() -> csv转为excel出错!", e.getMessage());
                    e.printStackTrace();
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (Exception e) {
                    log.error("inputStream.close() -> csv转为excel出错!", e.getMessage());
                    e.printStackTrace();
                }
            }
        }
        return new AsyncResult<>("task complete!");
    }

    @Async("asyncExcelAnalysisServiceExecutor")
    @Override
    public Future<String> mhtmlAsExcel(InputStream inputStream, OutputStream outputStream, String sheetName, boolean isGzip) {
        try {
            LoadOptions lo = new LoadOptions(LoadFormat.M_HTML); // 设置加载选项
            lo.setMemorySetting(MemorySetting.MEMORY_PREFERENCE);
            Workbook workbook = new Workbook(inputStream, lo);
            workbook.getWorksheets().get(0).setName(sheetName); // 设置工作簿的名称
            workbook.getWorksheets().get(0).setGridlinesVisible(true); // 显示网格线
            workbook.getWorksheets().get(0).autoFitColumns();// 设置自适应列宽
            workbook.getWorksheets().get(0).autoFitRows();// 设置自适应行高
            workbook.save(outputStream, LoadFormat.XLSX);
        } catch (Exception ioException) {
            log.error("workbook.write() -> 另存为excel出错!", ioException.getMessage());
            ioException.printStackTrace();
        } finally {
            if (outputStream != null) {
                try {
                    outputStream.close();
                } catch (Exception e) {
                    log.error("outputStream.close() -> csv转为excel出错!", e.getMessage());
                    e.printStackTrace();
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (Exception e) {
                    log.error("inputStream.close() -> csv转为excel出错!", e.getMessage());
                    e.printStackTrace();
                }
            }
        }
        return new AsyncResult<>("task complete!");
    }

    @PostConstruct
    public void initAspose() {
        try (InputStream is = com.aspose.cells.License.class.getResourceAsStream("/com.aspose.cells.lic_2999.xml")) {
            License asposeLicense = new License();
            asposeLicense.setLicense(is);
        } catch (Exception e) {
            log.error("initAspose -> 初始化加载license文件失败!", e.getMessage());
            e.printStackTrace();
        }
    }
}

3.aspose jar包参考

 从gitee上找到了一位大佬的学习实践教程,作者研究了aspose.cells的 20.7 的版本,距离最新版本发布相距 3 年多近 40 个Release版本,可参考:https://gitee.com/evilrule/crack-aspose,有需要旧版的可以移步查阅和实践,确实可以把《Evaluation Warning》警告去除;但是不可以进行商用

三.使用windows的office.exe解析mhtml(模拟打开另存为xlsx操作)

优点:把mhtml格式转换转发给office软件执行,java服务没有内存压力;

缺点:为了更换支持格式转换,一般需要部署在windows服务器上;

1.maven依赖

无。

2.代码示例

package com.abc.def.handler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import javax.servlet.ServletOutputStream;
import javax.servlet.http.HttpServletResponse;

import com.abc.def.common.utils.EasyExcelUtil;
import com.abc.def.common.utils.ObjectUtil;
import com.abc.def.pojo.enums.ConversionType;

import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.AsyncResult;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;

import static com.abc.def.common.utils.EasyExcelUtil.gzFileExtractor;
import static com.abc.def.pojo.enums.EpDownloadErrorCode.FILE_TOO_LARGE;
import static com.abc.def.pojo.enums.EpDownloadErrorCode.FILE_TYPE_ERROR;
import static com.abc.def.pojo.enums.EpDownloadErrorCode.FREQUENCY_TOO_HIGH_ERROR;

/**
 * description:mhtml转为excel
 *
 * @author: lgq
 * @create: 2024-05-17 11:27
 */
@Slf4j
@Component("mhtml2ExcelConversion")
public class Mhtml2ExcelConversion implements ConversionFunction {
    private static final String PS1_PATH = "D:\\ep\\mhtml2excel1.ps1";
    @Override
    public String converse(MultipartFile file, HttpServletResponse response) {
        String filename = file.getOriginalFilename();
        String fileType = "MHTML";
        String[] splitName = filename.split(".xls");
        if (ObjectUtil.isEmpty(splitName) || ObjectUtil.isEmpty(splitName[0])) {
            String errorMsg = "文件格式不对,请检查格式!";
            EasyExcelUtil.setResponseParam(response, "Process-Result", String.valueOf(FILE_TYPE_ERROR.getCode()));
            EasyExcelUtil.setResponseParam(response, "Content-Type", "text/plain");
            return errorMsg;
        }
        String pureFilename = splitName[0];
        EasyExcelUtil.setResponseParam(response, pureFilename);

        File tempFile = null;
        String targetFilePath = null;
        InputStream inputStream = null;
        try {
            inputStream = file.getInputStream();
            String tempFilePath = gzFileExtractor(inputStream);
            if (!ObjectUtil.isEmpty(tempFilePath) && "false".equals(tempFilePath)) {
                tempFilePath = UUID.randomUUID().toString();
                tempFile = new File(tempFilePath);
                if (!tempFile.exists()) {
                    tempFile.createNewFile();
                }
                tempFilePath = tempFile.getAbsolutePath();
                try (InputStream inputStream1 = file.getInputStream()) {
                    Files.copy(inputStream1, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                } catch (Exception e) {
                    log.error("MultipartFile 转为 File 失败!(mhtml)", e.getMessage());
                    e.printStackTrace();
                }
            } else {
                tempFile = new File(tempFilePath);
            }
            long fileLength = tempFile.length();

            log.info("the length of origin file is {} ", fileLength); //172460518
            if (fileLength > 100000000) {
                String errorMsg = "导出excel文件超过128M,请缩小查询范围后再进行导出!";
                EasyExcelUtil.setResponseParam(response, "Process-Result", String.valueOf(FILE_TOO_LARGE.getCode()));
                EasyExcelUtil.setResponseParam(response, "Content-Type", "text/plain");
                return errorMsg;
            }
            targetFilePath = tempFilePath + ".xlsx";
            Future<String> future = mhtmlAsExcel(tempFilePath, targetFilePath, response);
            return future.get();
        } catch (IOException ioException) {
            log.error("{}转为excel出错!", fileType, ioException.getMessage());
            ioException.printStackTrace();
        } catch (InterruptedException interruptedException) {
            log.error("{}转为excel出错!", fileType, interruptedException.getMessage());
            interruptedException.printStackTrace();
        } catch (ExecutionException executionException) {
            log.error("{}转为excel出错!", fileType, executionException.getMessage());
            executionException.printStackTrace();
        } catch (RuntimeException x) {
            log.error("{}转为excel出错!", fileType, x.getMessage());
            x.printStackTrace();
        } catch (Error x) {
            log.error("{}转为excel出错!", fileType, x.getMessage());
            x.printStackTrace();
        } catch (Throwable x) {
            log.error("{}转为excel出错!", fileType, x.getMessage());
            x.printStackTrace();
        } finally {
            if (tempFile != null && tempFile.delete()) {
                log.info("原始文件{}删除成功!", tempFile.getName());
            } else if (tempFile != null) {
                log.error("原始文件{}删除失败!", tempFile.getName());
            }
            try {
                if (!ObjectUtil.isEmpty(targetFilePath)) {
                    File targetFile = new File(targetFilePath);
                    if (targetFile.exists() && targetFile.delete()) {
                        log.info("转换后的文件{}删除成功!", targetFile.getName());
                    } else if (targetFile.exists()) {
                        log.error("转换后的文件{}删除失败!", targetFile.getName());
                    }
                }

            } catch (Exception e) {
                log.error("targetFile.delete() -> 删除转换后的文件出错!", e.getMessage());
                e.printStackTrace();
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (Exception e) {
                    log.error("inputStream.close() -> csv转为excel出错!", e.getMessage());
                    e.printStackTrace();
                }
            }
        }
        return "success";
    }


    @Async("asyncExcelAnalysisServiceExecutor")
    public Future<String> mhtmlAsExcel(String originFileName, String targetFileName, HttpServletResponse response)
            throws Exception {
        String powershellCommand = "powershell.exe "+ PS1_PATH +" " + originFileName + " " + targetFileName;
        try {
            // 定义PowerShell 命令
            String baseShellCommand = "Get-Process | Select-Object -First 5";
            // 创建ProcessBuilder 实例
            ProcessBuilder processBuilder = new ProcessBuilder("powershell", "-Command", powershellCommand);
            //启动进程
            Process process = processBuilder.start();

            BufferedReader reader = new BufferedReader(new InputStreamReader(process.getErrorStream(), Charset.forName("GBK")));
            String line;
            int errorLogLength = 0;
            while ((line = reader.readLine()) != null) {
                log.error(line);
                errorLogLength += line.length();
                break;
            }
            if (errorLogLength > 0) {
                String errorMsg = "mhtml转换为excel过于频繁,请稍后再试!";
                log.error(errorMsg);
                process.destroy();
                EasyExcelUtil.setResponseParam(response, "Process-Result", String.valueOf(FREQUENCY_TOO_HIGH_ERROR.getCode()));
                EasyExcelUtil.setResponseParam(response, "Content-Type", "text/plain");
                return new AsyncResult<>(errorMsg);
            }
            //等待进程结束并获取退出码

            int exitCode = process.waitFor();
            if (exitCode == 0) {
                log.info("Exited with error code :{} ", exitCode);
                File file = new File(targetFileName);
                if (file.exists()) {
                    // 读取本地文件
                    try (InputStream inputStream = new BufferedInputStream(new FileInputStream(file));
                         ServletOutputStream outputStream = response.getOutputStream()) {
                        byte[] buffer = new byte[1024];
                        int bytesRead;
                        // 将文件内容写入响应
                        while ((bytesRead = inputStream.read(buffer)) != -1) {
                            outputStream.write(buffer, 0, bytesRead);
                        }
                        outputStream.flush();
                    } catch (Exception e) {
                        log.error("读取mhtml转为excel后的xlsx文件执行出错!", e.getMessage());
                        throw new Exception(e);
                    }
                }
            } else {
                log.error("Exited with error code :{} ", exitCode);
            }

        } catch (Exception e) {
            log.error("mhtml转为excel的shell脚本执行出错!", e.getMessage());
            throw new Exception(e);
        }
        return new AsyncResult<>("success");
    }

    @Override
    public ConversionType getType() {
        return ConversionType.CONVERSION_MHTML2EXCEL;
    }
}

3.附件ps脚本

param($s,$d)
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false
$workbook = $excel.Workbooks.Open($s)
$workbook.SaveAs($d, 51) # 51 represents Excel Workbook format (.xlsx)
$workbook.Close()
$excel.Quit()
[System.Runtime.Interopservices.Marshal]::ReleaseComObject($workbook) 
[System.Runtime.Interopservices.Marshal]::ReleaseComObject($excel) 
Remove-Variable excel,workbook