JAVA读取（DOC、DOCX、PDF、PPT、PPTX）文件文本内容及图片

青葱年少 • 2023年12月26日下午5:23 • IT • 阅读 36

目录

以下为瞎扯淡：

温馨提示：有很多方法均可以解析这些常见的文件，以下内容使用的是apache-poi + apache-pdfbox实现的。

关于文档解析，在网上搜索了很久，无奈内容太过繁杂，找不到合适的代码，一大半都是只支持文本。没办法，只能自己在网上一点一点CV了，最终提取了这些代码，不能说好用吧，应该可解燃眉之急。关于doc文档以及pdf文档还是有很多问题的，后续希望大佬们能在帖子下面多多指正，能优化一下代码，那就更好了。

以下为正文内容：

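首先在 pom.xml 中加入以下依赖（注意：解析 DOCX/PPTX 用到的 XWPF、XSLF 类位于 poi-ooxml 中，下载网络文件用到 Apache HttpClient，代码里还引用了 dubbo 的 CollectionUtils，这几个依赖下面没有列出，需要自行补充）：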

        <dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>4.1.0</version>
		</dependency>
        <dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>4.1.0</version>
		</dependency>
        <dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.22</version>
		</dependency>

如果要测试，可以使用下面示例代码中的在线文档地址（注意：这个在线文档里没有图片）：

/**
 * Demo entry point: converts one local file and one remote file and prints the
 * resulting HTML of each to standard output. Extracted images are written to
 * the same image directory for both documents.
 *
 * @param args unused
 * @throws IOException if reading a document or writing an image fails
 */
public static void main(String[] args) throws IOException {
    final String imageDir = "E:\\临时图片";

    // Local file on disk.
    final String localHtml = processDocumentFromFilePath("E:\\VPN系统使用手册.pptx", imageDir);
    System.out.println(localHtml);

    // Remote file fetched over HTTP.
    final String remoteHtml = processDocumentFromUrl("http://api.idocv.com/data/doc/manual.docx", imageDir);
    System.out.println(remoteHtml);
}

然后上车：飕飕飕

import com.alibaba.dubbo.common.utils.CollectionUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.*;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.sl.usermodel.TextParagraph;
import org.apache.poi.xslf.usermodel.*;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;

/**
 * Converts DOC, DOCX, PDF, PPT and PPTX documents into a simple HTML string and
 * saves the images embedded in them to a directory on disk.
 *
 * <p>Text is emitted as {@code <p>}/{@code <span>} elements (and {@code <table>}
 * for DOCX tables); every saved image is referenced by an {@code <img>} element
 * whose {@code src} is the path returned by {@link #saveImageToFile}. All text
 * taken from the document is HTML-escaped before it is written to the output.
 */
public class FileProcessorUtils {

    /** Message of the exception thrown when the file extension is not supported. */
    private static final String UNSUPPORTED_FORMAT_MESSAGE =
            "不支持的文件格式,文件解析目前只支持(DOC/DOCX/PDF/PPT/PPTX)";

    /** Keeps generated image names distinct even when several are created in the same millisecond. */
    private static final AtomicLong IMAGE_SEQUENCE = new AtomicLong();

    /**
     * Extracts the content of a local file.
     *
     * @param filePath path of the document; its extension selects the parser
     * @param imgRoot  directory in which extracted images are stored
     * @return the document content rendered as HTML
     * @throws IOException              if the file cannot be read or an image cannot be written
     * @throws IllegalArgumentException if the extension is not doc, docx, pdf, ppt or pptx
     */
    public static String processDocumentFromFilePath(String filePath, String imgRoot) throws IOException {
        // Validate the type first so that no stream is opened for an unsupported file.
        String typeName = requireSupportedType(fileTypeName(filePath));
        try (InputStream inputStream = new FileInputStream(filePath)) {
            return processDocumentFromStream(typeName, inputStream, imgRoot);
        }
    }

    /**
     * Downloads a document over HTTP and extracts its content.
     *
     * @param downloadUrl download link of the document
     * @param imgRoot     directory in which extracted images are stored
     * @return the document content rendered as HTML
     * @throws IOException              if the download fails (including a non-2xx status or an
     *                                  empty body), the document cannot be parsed or an image
     *                                  cannot be written
     * @throws IllegalArgumentException if no supported extension can be derived from the URL
     */
    public static String processDocumentFromUrl(String downloadUrl, String imgRoot) throws IOException {
        // TODO: 2023/9/14 not every download link carries an extension; for more robustness the
        //  type could additionally be derived from the response (e.g. its Content-Type header).
        // Prefer the extension of the path part; fall back to the whole URL so that links such
        // as ".../download?file=a.docx" keep working as before.
        String typeName = fileTypeName(stripQueryAndFragment(downloadUrl));
        if (!isSupportedType(typeName)) {
            typeName = fileTypeName(downloadUrl);
        }
        requireSupportedType(typeName);

        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(new HttpGet(downloadUrl))) {
            int status = response.getStatusLine().getStatusCode();
            if (status < 200 || status >= 300) {
                // Do not feed an error page to the document parsers.
                throw new IOException("Download failed with HTTP status " + status + ": " + downloadUrl);
            }
            if (response.getEntity() == null) {
                throw new IOException("Download returned no content: " + downloadUrl);
            }
            try (InputStream body = response.getEntity().getContent()) {
                return processDocumentFromStream(typeName, body, imgRoot);
            }
        }
    }

    /** Routes the stream to the parser that matches the (lower-case) type name. */
    private static String processDocumentFromStream(String typeName, InputStream inputStream, String imgRoot)
            throws IOException {
        switch (typeName) {
            case "doc":
                return processWordDocDocumentFromStream(inputStream, imgRoot);
            case "docx":
                return processWordDocxDocumentFromStream(inputStream, imgRoot);
            case "pdf":
                return processPdfDocumentFromStream(inputStream, imgRoot);
            case "ppt":
                return processPptDocumentFromStream(inputStream, imgRoot);
            case "pptx":
                return processPptxDocumentFromStream(inputStream, imgRoot);
            default:
                throw new IllegalArgumentException(UNSUPPORTED_FORMAT_MESSAGE);
        }
    }

    /** Tells whether the (lower-case) type name is one of the supported document types. */
    private static boolean isSupportedType(String typeName) {
        switch (typeName) {
            case "doc":
            case "docx":
            case "pdf":
            case "ppt":
            case "pptx":
                return true;
            default:
                return false;
        }
    }

    /** Returns the type name unchanged, or throws if it is not a supported document type. */
    private static String requireSupportedType(String typeName) {
        if (!isSupportedType(typeName)) {
            throw new IllegalArgumentException(UNSUPPORTED_FORMAT_MESSAGE);
        }
        return typeName;
    }

    /** Cuts a URL at its first {@code ?} or {@code #}; returns {@code null} for {@code null}. */
    private static String stripQueryAndFragment(String url) {
        if (url == null) {
            return null;
        }
        int end = url.length();
        int queryIndex = url.indexOf('?');
        if (queryIndex >= 0) {
            end = queryIndex;
        }
        int fragmentIndex = url.indexOf('#');
        if (fragmentIndex >= 0 && fragmentIndex < end) {
            end = fragmentIndex;
        }
        return url.substring(0, end);
    }

    /**
     * Parses a Word 97-2003 (.doc) document.
     *
     * <p>Every paragraph becomes a {@code <p>} carrying its alignment; all pictures of the
     * document are saved and appended after the text.
     *
     * @param inputStream document stream (not closed by this method)
     * @param imageRoot   directory in which extracted images are stored
     * @return the document content rendered as HTML
     * @throws IOException if the document cannot be read or an image cannot be written
     */
    private static String processWordDocDocumentFromStream(InputStream inputStream, String imageRoot) throws IOException {
        StringBuilder htmlText = new StringBuilder();
        try (HWPFDocument document = new HWPFDocument(inputStream);
             WordExtractor extractor = new WordExtractor(document)) {
            String[] paragraphs = extractor.getParagraphText();
            Range range = document.getRange();
            int paragraphCount = range.numParagraphs();
            for (int paragraphIndex = 0; paragraphIndex < paragraphs.length; paragraphIndex++) {
                // Guard the lookup in case the extractor returned more entries than the range has.
                String justification = paragraphIndex < paragraphCount
                        ? getJustification(range.getParagraph(paragraphIndex).getJustification())
                        : "left";
                htmlText.append("<p style='text-align:").append(justification).append("'><span>")
                        .append(escapeHtml(paragraphs[paragraphIndex]))
                        .append("</span>").append("</p>");
            }

            // Extract the pictures; suggestFileExtension() picks the matching image type.
            List<Picture> pictures = document.getPicturesTable().getAllPictures();
            for (Picture picture : pictures) {
                String newFileName = uniqueImageName(picture.suggestFileExtension());
                String imgPath = saveImageToFile(picture.getContent(), newFileName, imageRoot);
                htmlText.append("<p><img alt='' src='").append(escapeHtml(imgPath)).append("'></p >");
            }
        }
        return htmlText.toString();
    }

    /**
     * Parses a Word 2007+ (.docx) document.
     *
     * <p>Body elements are processed in document order: paragraphs become {@code <p>} elements
     * (followed by the pictures embedded in them) and tables become {@code <table>} elements.
     *
     * @param inputStream document stream (not closed by this method)
     * @param imageRoot   directory in which extracted images are stored
     * @return the document content rendered as HTML
     * @throws IOException if the document cannot be read or an image cannot be written
     */
    private static String processWordDocxDocumentFromStream(InputStream inputStream, String imageRoot) throws IOException {
        StringBuilder htmlText = new StringBuilder();
        try (XWPFDocument document = new XWPFDocument(inputStream)) {
            // Append each body element according to its type.
            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    appendDocxParagraph(htmlText, (XWPFParagraph) element, imageRoot);
                } else if (element instanceof XWPFTable) {
                    htmlText.append(getTableHtmlText((XWPFTable) element));
                }
            }
        }
        return htmlText.toString();
    }

    /** Appends one DOCX paragraph, then one centered image paragraph per picture embedded in it. */
    private static void appendDocxParagraph(StringBuilder htmlText, XWPFParagraph paragraph, String imageRoot)
            throws IOException {
        htmlText.append("<p style='text-align:").append(toCssTextAlign(paragraph.getAlignment())).append("'>");
        List<XWPFRun> runs = paragraph.getRuns();
        for (XWPFRun run : runs) {
            appendStyledSpan(htmlText, run.getFontSize(), run.getFontFamily(), run.text());
        }
        htmlText.append("</p>");

        for (XWPFRun run : runs) {
            for (XWPFPicture picture : run.getEmbeddedPictures()) {
                XWPFPictureData pictureData = picture.getPictureData();
                if (pictureData == null) {
                    continue;
                }
                String newFileName = uniqueImageName(pictureData.suggestFileExtension());
                String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);
                htmlText.append("<p style='text-align:center'><img src='").append(escapeHtml(imgPath)).append("'></p>");
            }
        }
    }

    /**
     * Parses a PDF document. Only text is extracted, one {@code <p>} per text line;
     * images are not extracted, so {@code imageRoot} is currently unused.
     *
     * @param inputStream document stream (not closed by this method)
     * @param imageRoot   unused
     * @return the document text rendered as HTML
     * @throws IOException if the document cannot be read
     */
    private static String processPdfDocumentFromStream(InputStream inputStream, String imageRoot) throws IOException {
        StringBuilder htmlText = new StringBuilder();
        try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
            String text = new PDFTextStripper().getText(pdfDocument);
            // Accept both \n and \r\n so that no stray carriage return ends up in the output.
            for (String line : text.split("\\r?\\n")) {
                htmlText.append("<p style='text-align:left'>").append(escapeHtml(line)).append("</p>");
            }
        }
        return htmlText.toString();
    }

    /**
     * Parses a PowerPoint 97-2003 (.ppt) presentation.
     *
     * <p>Only the top-level shapes of each slide are visited: text shapes contribute their
     * paragraphs, picture shapes are saved as image files.
     *
     * @param inputStream document stream (not closed by this method)
     * @param imageRoot   directory in which extracted images are stored
     * @return the presentation content rendered as HTML
     * @throws IOException if the presentation cannot be read or an image cannot be written
     */
    private static String processPptDocumentFromStream(InputStream inputStream, String imageRoot) throws IOException {
        StringBuilder pptText = new StringBuilder();
        try (HSLFSlideShow ppt = new HSLFSlideShow(inputStream)) {
            for (HSLFSlide slide : ppt.getSlides()) {
                for (HSLFShape shape : slide.getShapes()) {
                    if (shape instanceof HSLFTextShape) {
                        for (HSLFTextParagraph paragraph : ((HSLFTextShape) shape).getTextParagraphs()) {
                            pptText.append("<p style='text-align:")
                                    .append(toCssTextAlign(paragraph.getTextAlign())).append("'>");
                            for (HSLFTextRun run : paragraph.getTextRuns()) {
                                appendStyledSpan(pptText, run.getFontSize(), run.getFontFamily(), run.getRawText());
                            }
                            pptText.append("</p>");
                        }
                    } else if (shape instanceof HSLFPictureShape) {
                        HSLFPictureData pictureData = ((HSLFPictureShape) shape).getPictureData();
                        if (pictureData == null) {
                            continue;
                        }
                        // The extension is taken from the subtype of the content type.
                        String newFileName = uniqueImageName(imageTypeName(pictureData.getContentType()));
                        String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);
                        pptText.append("<p style='text-align:center'><img src='").append(escapeHtml(imgPath)).append("'></p>");
                    }
                }
            }
        }
        return pptText.toString();
    }

    /**
     * Parses a PowerPoint 2007+ (.pptx) presentation.
     *
     * <p>Only the top-level shapes of each slide are visited: text shapes contribute their
     * paragraphs, picture shapes are saved as image files.
     *
     * @param inputStream document stream (not closed by this method)
     * @param imageRoot   directory in which extracted images are stored
     * @return the presentation content rendered as HTML
     * @throws IOException if the presentation cannot be read or an image cannot be written
     */
    private static String processPptxDocumentFromStream(InputStream inputStream, String imageRoot) throws IOException {
        StringBuilder pptxText = new StringBuilder();
        try (XMLSlideShow pptx = new XMLSlideShow(inputStream)) {
            for (XSLFSlide slide : pptx.getSlides()) {
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        for (XSLFTextParagraph paragraph : ((XSLFTextShape) shape).getTextParagraphs()) {
                            pptxText.append("<p style='text-align:")
                                    .append(toCssTextAlign(paragraph.getTextAlign())).append("'>");
                            for (XSLFTextRun run : paragraph.getTextRuns()) {
                                appendStyledSpan(pptxText, run.getFontSize(), run.getFontFamily(), run.getRawText());
                            }
                            pptxText.append("</p>");
                        }
                    } else if (shape instanceof XSLFPictureShape) {
                        XSLFPictureData pictureData = ((XSLFPictureShape) shape).getPictureData();
                        if (pictureData == null) {
                            continue;
                        }
                        String newFileName = uniqueImageName(pictureData.suggestFileExtension());
                        String imgPath = saveImageToFile(pictureData.getData(), newFileName, imageRoot);
                        pptxText.append("<p style='text-align:center'><img src='").append(escapeHtml(imgPath)).append("'></p>");
                    }
                }
            }
        }
        return pptxText.toString();
    }

    /**
     * Writes image bytes to {@code imageRoot}, creating the directory (including missing
     * parents) when necessary, and returns the path to reference the image by.
     *
     * @param imageData     raw image bytes
     * @param imageFileName file name of the image inside {@code imageRoot}
     * @param imageRoot     target directory
     * @return {@code imageRoot + File.separator + imageFileName}
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    public static String saveImageToFile(byte[] imageData, String imageFileName, String imageRoot) throws IOException {
        File directory = new File(imageRoot);
        // The second isDirectory() check covers a concurrent creation by another thread.
        if (!directory.isDirectory() && !directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Unable to create image directory: " + imageRoot);
        }
        String imagePath = imageRoot + File.separator + imageFileName;
        try (FileOutputStream fos = new FileOutputStream(imagePath)) {
            fos.write(imageData);
        }
        return imagePath;
    }

    /**
     * Renders a DOCX table as an HTML table with one {@code <td>} per cell.
     *
     * @param table table to render
     * @return the table as HTML
     */
    private static String getTableHtmlText(XWPFTable table) {
        StringBuilder tableHtml = new StringBuilder("<table>");
        for (XWPFTableRow row : table.getRows()) {
            tableHtml.append("<tr>");
            for (XWPFTableCell cell : row.getTableCells()) {
                tableHtml.append("<td>").append(escapeHtml(cell.getText())).append("</td>");
            }
            tableHtml.append("</tr>");
        }
        tableHtml.append("</table>");
        return tableHtml.toString();
    }

    /**
     * Returns the lower-case extension of a path or URL.
     *
     * @param filePath path or URL, may be {@code null}
     * @return the text after the last dot, or an empty string when there is none
     */
    private static String fileTypeName(String filePath) {
        if (filePath == null) {
            return "";
        }
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex > 0) {
            // Locale.ROOT keeps the result independent of the default locale.
            return filePath.substring(dotIndex + 1).toLowerCase(Locale.ROOT);
        }
        return "";
    }

    /**
     * Derives an image file extension from a content type such as {@code image/png}.
     *
     * @param imagePath content type of the image, may be {@code null}
     * @return the lower-case subtype (text after the last slash, without parameters),
     *         or an empty string when there is none
     */
    private static String imageTypeName(String imagePath) {
        if (imagePath == null) {
            return "";
        }
        String contentType = imagePath;
        int parameterIndex = contentType.indexOf(';');
        if (parameterIndex >= 0) {
            contentType = contentType.substring(0, parameterIndex);
        }
        int slashIndex = contentType.lastIndexOf('/');
        if (slashIndex > 0) {
            return contentType.substring(slashIndex + 1).trim().toLowerCase(Locale.ROOT);
        }
        return "";
    }

    /**
     * Maps the justification code of a .doc paragraph to a CSS {@code text-align} value.
     *
     * @param type justification code (0 left, 1 center, 2 right, 3 justified), may be {@code null}
     * @return the CSS value; {@code "left"} for {@code null} and for any other code
     */
    private static String getJustification(Integer type) {
        if (type == null) {
            return "left";
        }
        switch (type) {
            case 1:
                return "center";
            case 2:
                return "right";
            case 3:
                return "justify";
            default:
                return "left";
        }
    }

    /**
     * Maps a POI alignment enum constant to a CSS {@code text-align} value. The mapping goes
     * by constant name so that it serves both the XWPF and the slide-show alignment enums.
     */
    private static String toCssTextAlign(Enum<?> alignment) {
        if (alignment == null) {
            return "left";
        }
        switch (alignment.name()) {
            case "CENTER":
                return "center";
            case "RIGHT":
                return "right";
            case "START":
                return "start";
            case "END":
                return "end";
            case "BOTH":
            case "DISTRIBUTE":
            case "THAI_DISTRIBUTE":
            case "LOW_KASHIDA":
            case "MEDIUM_KASHIDA":
            case "HIGH_KASHIDA":
            case "JUSTIFY":
            case "JUSTIFY_LOW":
            case "DIST":
            case "THAI_DIST":
                return "justify";
            default:
                return "left";
        }
    }

    /**
     * Appends a {@code <span>} holding the escaped text. Font size and font family are only
     * written when the run actually defines them (size present and positive, family non-empty).
     */
    private static void appendStyledSpan(StringBuilder html, Number fontSize, String fontFamily, String text) {
        StringBuilder style = new StringBuilder();
        if (fontSize != null && fontSize.doubleValue() > 0) {
            style.append("font-size:").append(fontSize).append("pt;");
        }
        if (fontFamily != null && !fontFamily.isEmpty()) {
            if (style.length() > 0) {
                style.append(' ');
            }
            style.append("font-family:").append(escapeHtml(fontFamily)).append(';');
        }
        html.append("<span");
        if (style.length() > 0) {
            html.append(" style='").append(style).append('\'');
        }
        html.append('>').append(escapeHtml(text)).append("</span>");
    }

    /** Builds an image file name that is unique within this JVM, e.g. {@code 1700000000000_7_image.png}. */
    private static String uniqueImageName(String extension) {
        return System.currentTimeMillis() + "_" + IMAGE_SEQUENCE.incrementAndGet()
                + "_image." + (extension == null ? "" : extension);
    }

    /** Escapes the characters that are significant in HTML text and quoted attribute values. */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}

文章出处登录后可见！

已经登录？立即刷新

赞 (0)

青葱年少普通用户

0

AIGC大模型之——以文生图介绍

上一篇 2023年12月26日

00后最关注的职业：公务员排第二，第一是？

下一篇 2023年12月26日