1,这段代码的功能:

(1)从指定的HTML代码中提取出所有<img>标签的src路径

(2)根据提取出来的src路径,将对应的图片下载到本地

package com.googosoft.until;

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

public class HtmlUtil {

    public static String delHTMLTag(String htmlStr) {
        String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // 定义script的正则表达式
        String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // 定义style的正则表达式
        String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式

        Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
        Matcher m_script = p_script.matcher(htmlStr);
        htmlStr = m_script.replaceAll(""); // 过滤script标签

        Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
        Matcher m_style = p_style.matcher(htmlStr);
        htmlStr = m_style.replaceAll(""); // 过滤style标签

        Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
        Matcher m_html = p_html.matcher(htmlStr);
        htmlStr = m_html.replaceAll(""); // 过滤html标签
        return htmlStr.trim(); // 返回文本字符串
    }

    /**
     * 根据图片的网络路径将图片下载到本地,并返回本地路径
     * @param urlHttp 图片的网络路径
     * @param path 新生成的图片的目录
     * @return
     */
    private static String getPicture2(String urlHttp, String path) {
        FileOutputStream out = null;
        BufferedInputStream in = null;
        HttpURLConnection connection = null;
        String newPath = "";

        byte[] buf = new byte[1024];
        int len = 0;
        try {
            URL url = new URL(urlHttp);
            connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            in = new BufferedInputStream(connection.getInputStream());
            newPath = path + "/" + new Date().getTime() + ".jpg";
            out = new FileOutputStream(newPath);
            while ((len = in.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                in.close();
                out.close();
                connection.disconnect();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        return newPath;
    }

    /**
     * 提取HTML字符串中的img列表
     * @param htmlStr 要处理的html字符串
     * @return
     */
    private static List<String> getImgStrList(String htmlStr) {
        List<String> list = new ArrayList<>();
        String img = "";
        Pattern p_image;
        Matcher m_image;
        String regEx_img = "<img.*src\\s*=\\s*(.*?)[^>]*?>";
        p_image = Pattern.compile(regEx_img, Pattern.CASE_INSENSITIVE);
        m_image = p_image.matcher(htmlStr);
        while (m_image.find()) {
            img = m_image.group();
            Matcher m = Pattern.compile("src\\s*=\\s*\"?(.*?)(\"|>|\\s+)").matcher(img);
            while (m.find()) {
                list.add(handleSrc(m.group(1)));
            }
        }
        return list;
    }

    /**
     * 去除src路径中的前后单引号
     * @param src 图片的src路径
     * @return
     */
    private static String handleSrc(String src) {
        if (src != null) {
            if (src.startsWith("\'")) {
                return src.substring(1, src.length());
            }
            if (src.endsWith("\'")) {
                return src.substring(0, src.length());
            }

        }
        return src;
    }

    @Test
    public void testTransSrc() throws Exception {
        String str = "<h1 style=\'font-weight: 400; padding-left: 0px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); font-size: 24px; line-height: 36px; color: rgb(0, 0, 0); font-family: 微软雅黑; text-align: center;\'>标题</h1><div class=\'detail-body photos\' style=\'margin-top: 20px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); min-height: 306px; line-height: 26px; font-size: 16px; color: rgb(51, 51, 51); overflow-wrap: break-word; font-family: 微软雅黑;\'><pre id=\'content\' style=\'padding-right: 15px; padding-left: 15px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); white-space: pre-wrap; overflow-wrap: break-word; border-left-color: rgb(0, 150, 136); background-color: rgb(248, 248, 248); overflow: auto;\'><div class=\'entry\' style=\'margin-top: 30px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); border: 0px; outline: 0px; font-family: Arial, &quot;Microsoft YaHei&quot;, 微软雅黑, STHeiti, &quot;WenQuanYi Micro Hei&quot;, SimSun, sans-serif; min-height: 450px; color: rgb(61, 70, 77);\'><p style=\'margin-top: 5px; margin-bottom: 20px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); border: 0px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; line-height: 1.8; word-break: break-all; font-size: 16px; letter-spacing: 0px;\'><img class=\'\' src=\'https://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGLGbtyLtqR250PIBYM5GgGxHkrPdAv8yaAe1vCEvxIBeiaB0ibr8bWVBg/640?wx_fmt=jpeg&amp;tp=webp&amp;wxfrom=5&amp;wx_lazy=1&amp;wx_co=1\' crossorigin=\'anonymous\' data-croporisrc=\'http://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGd2DN46qD8MwKvRgjEZnwu3n47tHQxRnqw3snRDsvccFL6cjTOjDGXw/0?wx_fmt=jpeg\' data-cropx1=\'0\' data-cropx2=\'434\' data-cropy1=\'0\' data-cropy2=\'393\' data-ratio=\'0.9032258064516129\' data-src=\'https://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGLGbtyLtqR250PIBYM5GgGxHkrPdAv8yaAe1vCEvxIBeiaB0ibr8bWVBg/640?wx_fmt=jpeg\' data-type=\'jpeg\' data-w=\'434\' data-fail=\'0\' style=\'display: block; 
max-width: 750px; cursor: pointer; margin-top: 5px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; height: auto;\'><img class=\'size-full wp-image-28403 aligncenter\' src=\'http://www.yunweipai.com/wp-content/uploads/2019/04/20190425172338.jpg\' alt=\'\' width=\'434\' height=\'392\' style=\'display: block; max-width: 750px; cursor: pointer; margin: 5px auto 10px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; height: auto;\'></p></div></pre></div>";
        List<String> imgList = getImgStrList(str);
        for (String img : imgList) {
            System.out.println(getPicture2(img, "D://uploadFiles"));
        }
    }
    

}

 

版权声明:本文为excellencesy原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/excellencesy/p/11925914.html