一键采集微信公众号文章
Post Time:2023-06-21 15:00:51 Views:2441
最近经常阅读微信公众号的文章,发现有很多优秀的技术文章。每每转发给自己保存、想后续再看的时候,回头却发现文章已经被删除了,又或者是因为转发了其他的内容,又或者是多设备登录导致转发的聊天记录消失了。为了避免以上问题,我掏出jsoup研究了一番之后,决定把微信公众号的文章直接爬到本地收藏观看。废话不多说,直接上代码。
/**
 * Scrapes WeChat official-account articles with jsoup and stores them through
 * {@code MpPostsDaoService}, rewriting lazy-loaded image references to locally cached copies.
 */
@Service
@RequiredArgsConstructor
public class AsyncService {

    /** Matches an {@code <img>} tag and captures the value of its {@code data-src} attribute. */
    private static final Pattern IMG_DATA_SRC_PATTERN = Pattern.compile("<img.*?data-src=\"(.*?)\".*?>");

    /** Inline-script text that immediately precedes the quoted publish time. */
    private static final String CREATE_TIME_PREFIX = "var createTime = '";

    private final ServiceConfig serviceConfig;
    private final MpPostsDaoService mpPostsDaoService;

    /**
     * Asynchronous entry point: scrapes the article at {@code url} and persists it.
     *
     * @param url address of the article page
     * @throws ToastException if the page cannot be fetched, parsed or saved
     */
    @Async
    public void spiderMpPostUrl(String url) {
        this.spiderPost(url);
    }

    /**
     * Fetches the article page, extracts title, author, publish date and content, caches the
     * referenced images and saves the result. Does nothing when a record with the same origin
     * URL already exists.
     *
     * @param url address of the article page
     * @throws ToastException wrapping any failure raised while fetching, parsing or saving
     */
    private void spiderPost(String url) {
        // Skip articles that were already collected.
        MpPosts existing = this.mpPostsDaoService.lambdaQuery()
                .eq(MpPosts::getOriginUrl, url)
                .one();
        if (null != existing) {
            return;
        }

        MpPostSpiderResultVO resultVO = new MpPostSpiderResultVO();
        resultVO.setOriginUrl(url);

        Connection connect = Jsoup.connect(url);
        connect.timeout(10000);
        try {
            Connection.Response response = connect.execute();
            String body = response.body();
            if (StrUtil.isEmpty(body)) {
                throw new ToastException("获取网页内容失败");
            }
            Document document = Jsoup.parse(body);

            // Title: use the element's full text so it is found whether it sits directly in the
            // heading or inside a nested element; all whitespace is stripped as before.
            Element titleElement = document.getElementById("activity-name");
            if (null == titleElement) {
                throw new ToastException("获取文章标题失败");
            }
            resultVO.setTitle(titleElement.text().replaceAll("\\s+", ""));

            // Author (optional).
            Element jsName = document.getElementById("js_name");
            if (null != jsName) {
                resultVO.setAuthor(jsName.text());
            }

            // Publish date (optional): the script value has minute precision, so seconds are appended.
            String createTime = findCreateTime(document);
            if (null != createTime) {
                resultVO.setPostDate(DateUtil.parse(createTime + ":00"));
            }

            // Article content.
            Elements contentElements = document.getElementsByClass("rich_media_content");
            if (contentElements.isEmpty()) {
                throw new ToastException("获取文章内容失败");
            }
            Element contentNode = contentElements.get(0);
            contentNode.getElementsByClass("mp_profile_iframe_wrp").remove();

            StringBuilder content = new StringBuilder();
            Map<String, String> imagesMap = new HashMap<>();
            for (Node child : contentNode.childNodes()) {
                content.append(this.localizeImages(child.toString(), imagesMap));
            }
            resultVO.setContent(content.toString());

            Map<Object, Object> resourceUrlMap = new HashMap<>(1, 1.1F);
            resourceUrlMap.put("images", imagesMap);
            resultVO.setResourceUrl(resourceUrlMap);

            MpPosts mpPosts = CommonBeanUtil.copyBean(resultVO, MpPosts.class);
            this.mpPostsDaoService.save(mpPosts);
        } catch (Exception e) {
            throw new ToastException("文章采集失败:" + e.getMessage(), e);
        }
    }

    /**
     * Looks for the first inline script that declares {@code var createTime = '...'} and returns
     * the quoted value.
     *
     * @param document parsed article page
     * @return the raw value between the quotes, or {@code null} when no script declares it or the
     *         literal is empty or unterminated
     */
    private static String findCreateTime(Document document) {
        for (Element script : document.select("script")) {
            String html = script.html();
            int prefixAt = html.indexOf(CREATE_TIME_PREFIX);
            if (prefixAt == -1) {
                continue;
            }
            int valueStart = prefixAt + CREATE_TIME_PREFIX.length();
            int valueEnd = html.indexOf('\'', valueStart);
            // Only the first declaring script is considered; a malformed literal leaves the date unset.
            return valueEnd > valueStart ? html.substring(valueStart, valueEnd) : null;
        }
        return null;
    }

    /**
     * Caches every {@code data-src} image referenced in {@code html}, replaces each remote URL with
     * its local path and turns the {@code data-src} attributes into {@code src}.
     *
     * @param html      serialized HTML of one content node
     * @param imagesMap receives one {@code remote URL -> local path} entry per image found
     * @return the rewritten HTML
     * @throws ToastException if an image URL does not have the expected
     *                        {@code scheme://host/<dir>/<name>/<size>?<key>=<format>} shape
     */
    private String localizeImages(String html, Map<String, String> imagesMap) {
        String localized = html;
        Matcher matcher = IMG_DATA_SRC_PATTERN.matcher(html);
        while (matcher.find()) {
            String imageUrl = matcher.group(1);
            String[] segments = imageUrl.split("/");
            if (segments.length < 6) {
                throw new ToastException("无法解析图片地址:" + imageUrl);
            }
            String[] formatParts = segments[5].split("=");
            if (formatParts.length < 2) {
                throw new ToastException("无法解析图片地址:" + imageUrl);
            }
            String patch = segments[3];
            String fileName = segments[4];
            // The format is the first query value, cut at the next '&' (raw or as "&amp;").
            String format = formatParts[1];
            int ampAt = format.indexOf('&');
            String suffix = ampAt == -1 ? format : format.substring(0, ampAt);
            if ("other".equals(suffix)) {
                suffix = "jpg";
            }

            // NOTE(review): patch/fileName come from the scraped page and are used in a file path
            // unchecked; consider rejecting "." / ".." segments.
            String originPath = "/mp" + "/" + patch;
            String savePath = this.serviceConfig.getStaticResources().getRootPath() + "/wechat" + originPath;
            String fileSaveName = fileName + "." + suffix;
            String fileSavePath = savePath + "/" + fileSaveName;
            CacheImageUtil.refreshImages(imageUrl, fileSavePath, savePath, null);

            String dbFilePath = originPath + "/" + fileSaveName;
            imagesMap.put(imageUrl, dbFilePath);
            // Literal replacement: the URL is data, not a regular expression.
            localized = localized.replace(imageUrl, dbFilePath);
        }
        return localized.replace("data-src", "src");
    }
}
以上代码仅供参考
暂无评论
「人生在世,留句话给我吧」
QQ登录免填信息

撰写评论
本博客内所有原创和翻译的文章的版权归本人所有,允许第三方转载,但转载时请务必保留作者名,并注明出处链接,否则本人将保留追究其法律责任的权利。