JAVA-替换html中图片的路径-从html代码中提取图片路径并下载
1,这段代码的功能:
(1)可以将指定的html代码中img标签的src路径(图片地址)提取出来
(2)根据提取出来的src路径,将对应的图片下载到本地,并返回本地保存路径
package com.googosoft.until;

import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.Test;

/**
 * HTML helpers: strip markup from a fragment, extract the {@code src} URLs of
 * {@code <img>} tags, and download the referenced images to a local directory.
 *
 * <p>Parsing is regex based and therefore best-effort; it is not a full HTML parser.
 */
public class HtmlUtil {

    /** Matches a whole {@code <script>...</script>} element. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style>...</style>} element. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Matches any remaining HTML tag. */
    private static final Pattern TAG_PATTERN =
            Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches one {@code <img ...>} tag. {@code [^>]*} keeps the match inside a single tag
     * (and may span line breaks); an attribute value containing a literal {@code >} will
     * truncate the match.
     */
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches the {@code src} attribute inside an img tag. The leading whitespace requirement
     * prevents {@code data-src}, {@code data-croporisrc} and similar attributes from matching.
     * Group 1 = double-quoted value, group 2 = single-quoted value, group 3 = unquoted value.
     */
    private static final Pattern SRC_ATTR_PATTERN =
            Pattern.compile("\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]+))",
                    Pattern.CASE_INSENSITIVE);

    /**
     * Removes script elements, style elements and all remaining tags from an HTML string.
     *
     * @param htmlStr the HTML to clean; {@code null} is treated as empty
     * @return the remaining text with leading and trailing whitespace trimmed
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        // Order matters: script/style bodies must go before the generic tag filter,
        // otherwise their contents would survive as plain text.
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Downloads the image at the given URL into a directory and returns the local path.
     *
     * <p>The file is named with the current time in milliseconds plus {@code .jpg}. Failures
     * are printed and not rethrown: an empty string is returned when the remote stream could
     * not be opened; if the failure happens later, the path of the (possibly incomplete)
     * file is returned.
     *
     * @param urlHttp network URL of the image
     * @param path    directory in which the new file is created
     * @return the local file path, or an empty string if the download could not start
     */
    private static String getPicture2(String urlHttp, String path) {
        HttpURLConnection connection = null;
        String newPath = "";
        try {
            URL url = new URL(urlHttp);
            connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            // try-with-resources closes both streams even when one of them fails to open
            // or close; the previous finally block threw NullPointerException in that case.
            try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream())) {
                newPath = path + "/" + new Date().getTime() + ".jpg";
                try (FileOutputStream out = new FileOutputStream(newPath)) {
                    byte[] buf = new byte[1024];
                    int len;
                    while ((len = in.read(buf)) != -1) {
                        out.write(buf, 0, len);
                    }
                    out.flush();
                }
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return newPath;
    }

    /**
     * Extracts the {@code src} value of every {@code <img>} tag in an HTML string.
     *
     * <p>Each tag contributes at most one entry (its first {@code src} attribute); tags
     * without a usable {@code src} are skipped.
     *
     * @param htmlStr the HTML to scan; {@code null} yields an empty list
     * @return the src values in document order, without surrounding quotes
     */
    private static List<String> getImgStrList(String htmlStr) {
        List<String> list = new ArrayList<>();
        if (htmlStr == null) {
            return list;
        }
        Matcher tagMatcher = IMG_TAG_PATTERN.matcher(htmlStr);
        while (tagMatcher.find()) {
            Matcher attrMatcher = SRC_ATTR_PATTERN.matcher(tagMatcher.group());
            if (!attrMatcher.find()) {
                continue;
            }
            // Exactly one of the three alternatives captured the value.
            String value = null;
            for (int group = 1; group <= 3 && value == null; group++) {
                value = attrMatcher.group(group);
            }
            String src = handleSrc(value);
            if (src != null && !src.isEmpty()) {
                list.add(src);
            }
        }
        return list;
    }

    /**
     * Removes a leading and/or trailing single quote from a src value.
     *
     * @param src the raw src value, may be {@code null}
     * @return the value without surrounding single quotes, or {@code null} if src is null
     */
    private static String handleSrc(String src) {
        if (src == null) {
            return null;
        }
        int start = src.startsWith("\'") ? 1 : 0;
        int end = src.length();
        // Guard end > start so that a lone "'" is not stripped twice.
        if (end > start && src.endsWith("\'")) {
            end--;
        }
        return src.substring(start, end);
    }

    /**
     * Manual check: extracts the image URLs from a sample article and downloads each one
     * into {@code D://uploadFiles}, printing the resulting local paths. Requires network
     * access and an existing target directory.
     */
    @Test
    public void testTransSrc() throws Exception {
        String str = "<h1 style=\'font-weight: 400; padding-left: 0px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); font-size: 24px; line-height: 36px; color: rgb(0, 0, 0); font-family: 微软雅黑; text-align: center;\'>标题</h1>"
                + "<div class=\'detail-body photos\' style=\'margin-top: 20px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); min-height: 306px; line-height: 26px; font-size: 16px; color: rgb(51, 51, 51); overflow-wrap: break-word; font-family: 微软雅黑;\'>"
                + "<pre id=\'content\' style=\'padding-right: 15px; padding-left: 15px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); white-space: pre-wrap; overflow-wrap: break-word; border-left-color: rgb(0, 150, 136); background-color: rgb(248, 248, 248); overflow: auto;\'>"
                // The double quotes around the font names must be escaped inside the literal.
                + "<div class=\'entry\' style=\'margin-top: 30px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); border: 0px; outline: 0px; font-family: Arial, \"Microsoft YaHei\", 微软雅黑, STHeiti, \"WenQuanYi Micro Hei\", SimSun, sans-serif; min-height: 450px; color: rgb(61, 70, 77);\'>"
                + "<p style=\'margin-top: 5px; margin-bottom: 20px; -webkit-tap-highlight-color: rgba(0, 0, 0, 0); border: 0px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; line-height: 1.8; word-break: break-all; font-size: 16px; letter-spacing: 0px;\'>"
                + "<img class=\'\' src=\'https://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGLGbtyLtqR250PIBYM5GgGxHkrPdAv8yaAe1vCEvxIBeiaB0ibr8bWVBg/640?wx_fmt=jpeg&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1\' crossorigin=\'anonymous\' data-croporisrc=\'http://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGd2DN46qD8MwKvRgjEZnwu3n47tHQxRnqw3snRDsvccFL6cjTOjDGXw/0?wx_fmt=jpeg\' data-cropx1=\'0\' data-cropx2=\'434\' data-cropy1=\'0\' data-cropy2=\'393\' data-ratio=\'0.9032258064516129\' data-src=\'https://mmbiz.qpic.cn/mmbiz_jpg/PiaIQldYWZNPy0KlJ6MxyUJic2lAv3JQfGLGbtyLtqR250PIBYM5GgGxHkrPdAv8yaAe1vCEvxIBeiaB0ibr8bWVBg/640?wx_fmt=jpeg\' data-type=\'jpeg\' data-w=\'434\' data-fail=\'0\' style=\'display: block; max-width: 750px; cursor: pointer; margin-top: 5px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; height: auto;\'>"
                + "<img class=\'size-full wp-image-28403 aligncenter\' src=\'http://www.yunweipai.com/wp-content/uploads/2019/04/20190425172338.jpg\' alt=\'\' width=\'434\' height=\'392\' style=\'display: block; max-width: 750px; cursor: pointer; margin: 5px auto 10px; outline: 0px; font-weight: inherit; font-style: inherit; font-family: inherit; height: auto;\'>"
                + "</p></div></pre></div>";
        List<String> imgList = getImgStrList(str);
        for (String img : imgList) {
            System.out.println(getPicture2(img, "D://uploadFiles"));
        }
    }
}
版权声明:本文为excellencesy原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。