三种方式实现mhtml格式转为excel
pdf转word, ppt转pdf等office,pdf格式相互转换都比较容易实现,比如可以用openoffice,onlyoffice等。但是mhtml(带有MIME附件的html文件)转excel,网上一搜可能没有那么多资料,本文将介绍三种方案将mhtml转为excel。
一.使用javax.mail解析mhtml
优点:可以按照自己的需要解析指定的内容和标签;
缺点:解析时整个文件是一次性加载到内存中,内存占用较大;需要自己解析指定的内容;
1.maven依赖
<dependency>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
<version>1.4.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
2.代码示例
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Properties;
import javax.mail.BodyPart;
import javax.mail.Multipart;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* description:MHTMLReader
*
* @author: lgq
* @create: 2024-05-16 15:36
*/
public class MHTMLReader {
public static void main(String[] args) {
for (int i = 0; i< 100;i++) {
int random = (int)(Math.random()*(11));
System.out.println(random);
}
try {
File mhtmlFile = new File("C:\\Users\\lgq\\Downloads\\003-r.mhtml");
InputStream inputStream = new FileInputStream(mhtmlFile);
MimeMessage mimeMessage = new MimeMessage(Session.getDefaultInstance(new Properties(), null), inputStream);
// 解析MimeMessage对象,获取所需信息
Object content = mimeMessage.getContent();
if (content instanceof Multipart) {
Multipart multipart = (Multipart) content;
BodyPart bodyPart = multipart.getBodyPart(0);
Object content1 = bodyPart.getContent();
//获取内容
String s = content1.toString();
Document document = Jsoup.parse(s);
Elements elements = document.select("tr");
for (Element element : elements) {
String text = element.text();
System.out.println(text);
}
}
} catch (Exception e ) {
e.printStackTrace();
}
}
}
二.使用aspose解析mhtml
优点:可以直接将mhtml格式转为其他格式,比如xlsx;
缺点:内存占用非常大,多并发情况下很容易造成服务崩溃;
1.maven依赖
<dependency>
<groupId>com.aspose</groupId>
<artifactId>aspose-cells</artifactId>
<version>20.7</version>
<scope>system</scope>
<systemPath>${project.basedir}/src/main/resources/lib/aspose-cells-20.7-crack.jar</systemPath>
</dependency>
2.代码示例
package com.abc.def.service.impl;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.concurrent.Future;
import javax.annotation.PostConstruct;
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.ExcelWriter;
import com.alibaba.excel.support.ExcelTypeEnum;
import com.alibaba.excel.write.metadata.WriteSheet;
import com.aspose.cells.License;
import com.aspose.cells.LoadFormat;
import com.aspose.cells.LoadOptions;
import com.aspose.cells.MemorySetting;
import com.aspose.cells.Workbook;
import com.aspose.cells.Worksheet;
import com.abc.def.common.utils.EasyExcelUtil;
import com.abc.def.listener.EasyExcelGeneralCsvListener;
import com.abc.def.service.ExcelAnalysisService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.AsyncResult;
import org.springframework.stereotype.Service;
/**
* description:ExcelAnalysisService实现类
*
* @author: lgq
* @create: 2024-04-17 14:53
*/
@Service
@Slf4j
public class ExcelAnalysisServiceImpl implements ExcelAnalysisService {
@Async("asyncExcelAnalysisServiceExecutor")
@Override
public Future<String> csv2Excel(InputStream inputStream, OutputStream outputStream, String sheetName, boolean isGzip) {
try {
ExcelWriter writer = EasyExcel.write(outputStream).excelType(ExcelTypeEnum.XLSX).build();
EasyExcel.read(inputStream, new EasyExcelGeneralCsvListener(writer, new WriteSheet()))
.excelType(ExcelTypeEnum.CSV)
.charset(Charset.forName("UTF-8"))
.sheet()
.sheetNo(0)
.sheetName(sheetName)
.doRead();
writer.finish();
outputStream.flush();
} catch (Exception e) {
log.error("csv转为excel出错!", e.getMessage());
e.printStackTrace();
} finally {
if (outputStream != null) {
try {
outputStream.close();
} catch (Exception e) {
log.error("outputStream.close() -> csv转为excel出错!", e.getMessage());
e.printStackTrace();
}
}
if (inputStream != null) {
try {
inputStream.close();
} catch (Exception e) {
log.error("inputStream.close() -> csv转为excel出错!", e.getMessage());
e.printStackTrace();
}
}
}
return new AsyncResult<>("task complete!");
}
@Async("asyncExcelAnalysisServiceExecutor")
@Override
public Future<String> mhtmlAsExcel(InputStream inputStream, OutputStream outputStream, String sheetName, boolean isGzip) {
try {
LoadOptions lo = new LoadOptions(LoadFormat.M_HTML); // 设置加载选项
lo.setMemorySetting(MemorySetting.MEMORY_PREFERENCE);
Workbook workbook = new Workbook(inputStream, lo);
workbook.getWorksheets().get(0).setName(sheetName); // 设置工作簿的名称
workbook.getWorksheets().get(0).setGridlinesVisible(true); // 显示网格线
workbook.getWorksheets().get(0).autoFitColumns();// 设置自适应列宽
workbook.getWorksheets().get(0).autoFitRows();// 设置自适应行高
workbook.save(outputStream, LoadFormat.XLSX);
} catch (Exception ioException) {
log.error("workbook.write() -> 另存为excel出错!", ioException.getMessage());
ioException.printStackTrace();
} finally {
if (outputStream != null) {
try {
outputStream.close();
} catch (Exception e) {
log.error("outputStream.close() -> csv转为excel出错!", e.getMessage());
e.printStackTrace();
}
}
if (inputStream != null) {
try {
inputStream.close();
} catch (Exception e) {
log.error("inputStream.close() -> csv转为excel出错!", e.getMessage());
e.printStackTrace();
}
}
}
return new AsyncResult<>("task complete!");
}
@PostConstruct
public void initAspose() {
try (InputStream is = com.aspose.cells.License.class.getResourceAsStream("/com.aspose.cells.lic_2999.xml")) {
License asposeLicense = new License();
asposeLicense.setLicense(is);
} catch (Exception e) {
log.error("initAspose -> 初始化加载license文件失败!", e.getMessage());
e.printStackTrace();
}
}
}
3.aspose jar包参考
从gitee上找到了一位大佬的学习实践教程,作者研究了aspose.cells的 20.7 的版本,距离最新版本发布相距 3 年多近 40 个Release版本,可参考:https://gitee.com/evilrule/crack-aspose,有需要旧版的可以移步查阅和实践,确实可以把《Evaluation Warning》警告去除;但是不可以进行商用。
三.使用windows的office.exe解析mhtml(模拟打开另存为xlsx操作)
优点:把mhtml格式转换转发给office软件执行,java服务没有内存压力;
缺点:为了更换支持格式转换,一般需要部署在windows服务器上;
1.maven依赖
无。
2.代码示例
package com.abc.def.handler;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import javax.servlet.ServletOutputStream;
import javax.servlet.http.HttpServletResponse;
import com.abc.def.common.utils.EasyExcelUtil;
import com.abc.def.common.utils.ObjectUtil;
import com.abc.def.pojo.enums.ConversionType;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.AsyncResult;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import static com.abc.def.common.utils.EasyExcelUtil.gzFileExtractor;
import static com.abc.def.pojo.enums.EpDownloadErrorCode.FILE_TOO_LARGE;
import static com.abc.def.pojo.enums.EpDownloadErrorCode.FILE_TYPE_ERROR;
import static com.abc.def.pojo.enums.EpDownloadErrorCode.FREQUENCY_TOO_HIGH_ERROR;
/**
* description:mhtml转为excel
*
* @author: lgq
* @create: 2024-05-17 11:27
*/
@Slf4j
@Component("mhtml2ExcelConversion")
public class Mhtml2ExcelConversion implements ConversionFunction {
private static final String PS1_PATH = "D:\\ep\\mhtml2excel1.ps1";
@Override
public String converse(MultipartFile file, HttpServletResponse response) {
String filename = file.getOriginalFilename();
String fileType = "MHTML";
String[] splitName = filename.split(".xls");
if (ObjectUtil.isEmpty(splitName) || ObjectUtil.isEmpty(splitName[0])) {
String errorMsg = "文件格式不对,请检查格式!";
EasyExcelUtil.setResponseParam(response, "Process-Result", String.valueOf(FILE_TYPE_ERROR.getCode()));
EasyExcelUtil.setResponseParam(response, "Content-Type", "text/plain");
return errorMsg;
}
String pureFilename = splitName[0];
EasyExcelUtil.setResponseParam(response, pureFilename);
File tempFile = null;
String targetFilePath = null;
InputStream inputStream = null;
try {
inputStream = file.getInputStream();
String tempFilePath = gzFileExtractor(inputStream);
if (!ObjectUtil.isEmpty(tempFilePath) && "false".equals(tempFilePath)) {
tempFilePath = UUID.randomUUID().toString();
tempFile = new File(tempFilePath);
if (!tempFile.exists()) {
tempFile.createNewFile();
}
tempFilePath = tempFile.getAbsolutePath();
try (InputStream inputStream1 = file.getInputStream()) {
Files.copy(inputStream1, tempFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
} catch (Exception e) {
log.error("MultipartFile 转为 File 失败!(mhtml)", e.getMessage());
e.printStackTrace();
}
} else {
tempFile = new File(tempFilePath);
}
long fileLength = tempFile.length();
log.info("the length of origin file is {} ", fileLength); //172460518
if (fileLength > 100000000) {
String errorMsg = "导出excel文件超过128M,请缩小查询范围后再进行导出!";
EasyExcelUtil.setResponseParam(response, "Process-Result", String.valueOf(FILE_TOO_LARGE.getCode()));
EasyExcelUtil.setResponseParam(response, "Content-Type", "text/plain");
return errorMsg;
}
targetFilePath = tempFilePath + ".xlsx";
Future<String> future = mhtmlAsExcel(tempFilePath, targetFilePath, response);
return future.get();
} catch (IOException ioException) {
log.error("{}转为excel出错!", fileType, ioException.getMessage());
ioException.printStackTrace();
} catch (InterruptedException interruptedException) {
log.error("{}转为excel出错!", fileType, interruptedException.getMessage());
interruptedException.printStackTrace();
} catch (ExecutionException executionException) {
log.error("{}转为excel出错!", fileType, executionException.getMessage());
executionException.printStackTrace();
} catch (RuntimeException x) {
log.error("{}转为excel出错!", fileType, x.getMessage());
x.printStackTrace();
} catch (Error x) {
log.error("{}转为excel出错!", fileType, x.getMessage());
x.printStackTrace();
} catch (Throwable x) {
log.error("{}转为excel出错!", fileType, x.getMessage());
x.printStackTrace();
} finally {
if (tempFile != null && tempFile.delete()) {
log.info("原始文件{}删除成功!", tempFile.getName());
} else if (tempFile != null) {
log.error("原始文件{}删除失败!", tempFile.getName());
}
try {
if (!ObjectUtil.isEmpty(targetFilePath)) {
File targetFile = new File(targetFilePath);
if (targetFile.exists() && targetFile.delete()) {
log.info("转换后的文件{}删除成功!", targetFile.getName());
} else if (targetFile.exists()) {
log.error("转换后的文件{}删除失败!", targetFile.getName());
}
}
} catch (Exception e) {
log.error("targetFile.delete() -> 删除转换后的文件出错!", e.getMessage());
e.printStackTrace();
}
if (inputStream != null) {
try {
inputStream.close();
} catch (Exception e) {
log.error("inputStream.close() -> csv转为excel出错!", e.getMessage());
e.printStackTrace();
}
}
}
return "success";
}
@Async("asyncExcelAnalysisServiceExecutor")
public Future<String> mhtmlAsExcel(String originFileName, String targetFileName, HttpServletResponse response)
throws Exception {
String powershellCommand = "powershell.exe "+ PS1_PATH +" " + originFileName + " " + targetFileName;
try {
// 定义PowerShell 命令
String baseShellCommand = "Get-Process | Select-Object -First 5";
// 创建ProcessBuilder 实例
ProcessBuilder processBuilder = new ProcessBuilder("powershell", "-Command", powershellCommand);
//启动进程
Process process = processBuilder.start();
BufferedReader reader = new BufferedReader(new InputStreamReader(process.getErrorStream(), Charset.forName("GBK")));
String line;
int errorLogLength = 0;
while ((line = reader.readLine()) != null) {
log.error(line);
errorLogLength += line.length();
break;
}
if (errorLogLength > 0) {
String errorMsg = "mhtml转换为excel过于频繁,请稍后再试!";
log.error(errorMsg);
process.destroy();
EasyExcelUtil.setResponseParam(response, "Process-Result", String.valueOf(FREQUENCY_TOO_HIGH_ERROR.getCode()));
EasyExcelUtil.setResponseParam(response, "Content-Type", "text/plain");
return new AsyncResult<>(errorMsg);
}
//等待进程结束并获取退出码
int exitCode = process.waitFor();
if (exitCode == 0) {
log.info("Exited with error code :{} ", exitCode);
File file = new File(targetFileName);
if (file.exists()) {
// 读取本地文件
try (InputStream inputStream = new BufferedInputStream(new FileInputStream(file));
ServletOutputStream outputStream = response.getOutputStream()) {
byte[] buffer = new byte[1024];
int bytesRead;
// 将文件内容写入响应
while ((bytesRead = inputStream.read(buffer)) != -1) {
outputStream.write(buffer, 0, bytesRead);
}
outputStream.flush();
} catch (Exception e) {
log.error("读取mhtml转为excel后的xlsx文件执行出错!", e.getMessage());
throw new Exception(e);
}
}
} else {
log.error("Exited with error code :{} ", exitCode);
}
} catch (Exception e) {
log.error("mhtml转为excel的shell脚本执行出错!", e.getMessage());
throw new Exception(e);
}
return new AsyncResult<>("success");
}
@Override
public ConversionType getType() {
return ConversionType.CONVERSION_MHTML2EXCEL;
}
}
3.附件ps脚本
param($s,$d)
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false
$workbook = $excel.Workbooks.Open($s)
$workbook.SaveAs($d, 51) # 51 represents Excel Workbook format (.xlsx)
$workbook.Close()
$excel.Quit()
[System.Runtime.Interopservices.Marshal]::ReleaseComObject($workbook)
[System.Runtime.Interopservices.Marshal]::ReleaseComObject($excel)
Remove-Variable excel,workbook