一键采集微信公众号文章

demo

最近经常阅读微信公众号的文章,发现有很多优秀的技术文章。每每转发给自己保存、想后续再看的时候,回头却发现文章已经被删除了,又或者是因为转发了其他的内容,又或者是多设备登录导致转发的聊天记录消失了。为了避免以上问题,我掏出 jsoup 研究了一番之后,决定把微信公众号的文章直接爬到本地收藏观看。废话不多说,直接上代码。

/**
 * Collects WeChat official-account articles and stores them locally.
 *
 * <p>For a given article URL the service fetches the page with jsoup, extracts the title,
 * author, publish date and rich-text content, hands every {@code data-src} image to
 * {@code CacheImageUtil.refreshImages}, rewrites the image references in the stored HTML
 * to the local relative paths, and finally saves the result through
 * {@link MpPostsDaoService}.
 */
@Service
@RequiredArgsConstructor
public class AsyncService {

    /** Matches an {@code <img>} tag in serialized HTML and captures its {@code data-src} value. */
    private static final Pattern IMG_DATA_SRC_PATTERN = Pattern.compile("<img.*?data-src=\"(.*?)\".*?>");

    /** Prefix of the inline-script statement that carries the publish time as a quoted literal. */
    private static final String CREATE_TIME_MARKER = "var createTime = '";

    private final ServiceConfig serviceConfig;
    private final MpPostsDaoService mpPostsDaoService;

    /**
     * Collects the article at {@code url}; annotated with {@code @Async}.
     *
     * @param url the article URL to collect
     */
    @Async
    public void spiderMpPostUrl(String url) {
        this.spiderPost(url);
    }

    /**
     * Fetches, parses and persists a single article. Does nothing when a record with the
     * same origin URL already exists.
     *
     * @param url the article URL to collect
     * @throws ToastException if the page cannot be fetched, parsed or saved; the original
     *                        exception is kept as the cause
     */
    private void spiderPost(String url) {
        // Skip articles that were already collected.
        MpPosts existing = this.mpPostsDaoService.lambdaQuery()
                .eq(MpPosts::getOriginUrl, url)
                .one();
        if (existing != null) {
            return;
        }

        MpPostSpiderResultVO resultVO = new MpPostSpiderResultVO();
        resultVO.setOriginUrl(url);
        try {
            Connection connect = Jsoup.connect(url);
            connect.timeout(10000);
            String body = connect.execute().body();
            if (StrUtil.isEmpty(body)) {
                throw new ToastException("获取网页内容失败");
            }
            Document document = Jsoup.parse(body);

            // Title: take the element's whole text so nested markup inside the heading
            // does not yield an empty title; all whitespace is stripped as before.
            Element titleElement = document.getElementById("activity-name");
            if (titleElement == null) {
                throw new ToastException("未找到文章标题");
            }
            resultVO.setTitle(titleElement.text().replaceAll("\\s+", ""));

            // Author (optional).
            Element jsName = document.getElementById("js_name");
            if (jsName != null) {
                resultVO.setAuthor(jsName.text());
            }

            // Publish date: read from the first inline script that declares createTime.
            for (Element script : document.select("script")) {
                String createTime = extractCreateTime(script.html());
                if (createTime != null) {
                    // ":00" is appended before parsing, as the original code did;
                    // presumably the page value has minute precision — TODO confirm.
                    resultVO.setPostDate(DateUtil.parse(createTime + ":00"));
                    break;
                }
            }

            // Content: drop the embedded profile widget, then localize images node by node.
            Elements contentElements = document.getElementsByClass("rich_media_content");
            if (contentElements.isEmpty()) {
                throw new ToastException("未找到文章正文");
            }
            Element contentNode = contentElements.get(0);
            contentNode.getElementsByClass("mp_profile_iframe_wrp").remove();

            StringBuilder content = new StringBuilder();
            Map<String, String> imagesMap = new HashMap<>();
            for (Node child : contentNode.childNodes()) {
                content.append(this.localizeImages(child.toString(), imagesMap));
            }
            resultVO.setContent(content.toString());

            Map<Object, Object> resourceUrlMap = new HashMap<>();
            resourceUrlMap.put("images", imagesMap);
            resultVO.setResourceUrl(resourceUrlMap);

            MpPosts mpPosts = CommonBeanUtil.copyBean(resultVO, MpPosts.class);
            this.mpPostsDaoService.save(mpPosts);
        } catch (Exception e) {
            throw new ToastException("文章采集失败:" + e.getMessage(), e);
        }
    }

    /**
     * Extracts the quoted value of {@code var createTime = '...'} from a script body.
     *
     * @param script the script source text
     * @return the text between the quotes, or {@code null} when the statement is absent,
     *         unterminated or empty
     */
    private static String extractCreateTime(String script) {
        int markerIndex = script.indexOf(CREATE_TIME_MARKER);
        if (markerIndex < 0) {
            return null;
        }
        int valueStart = markerIndex + CREATE_TIME_MARKER.length();
        int valueEnd = script.indexOf('\'', valueStart);
        if (valueEnd <= valueStart) {
            return null;
        }
        return script.substring(valueStart, valueEnd);
    }

    /**
     * Caches every {@code data-src} image found in {@code html} and rewrites the markup to
     * reference the local copies.
     *
     * <p>Image URLs that do not have the expected
     * {@code scheme://host/folder/name/last?key=format} shape, or whose segments are not
     * safe to use in a file path, are left pointing at their remote address instead of
     * aborting the whole article.
     *
     * @param html      serialized HTML of one content node
     * @param imagesMap receives a mapping from each original image URL to its local
     *                  relative path
     * @return the HTML with cached image URLs replaced and {@code data-src} renamed to
     *         {@code src}
     */
    private String localizeImages(String html, Map<String, String> imagesMap) {
        String localized = html;
        Matcher matcher = IMG_DATA_SRC_PATTERN.matcher(html);
        while (matcher.find()) {
            String imageUrl = matcher.group(1);
            String[] segments = imageUrl.split("/");
            if (segments.length < 6) {
                continue;
            }
            String folder = segments[3];
            String fileName = segments[4];
            String suffix = extractImageSuffix(segments[5]);
            if (suffix == null
                    || !isSafePathSegment(folder)
                    || !isSafePathSegment(fileName)
                    || !isSafePathSegment(suffix)) {
                continue;
            }

            String originPath = "/mp" + "/" + folder;
            String savePath = this.serviceConfig.getStaticResources().getRootPath() + "/wechat" + originPath;
            String fileSaveName = fileName + "." + suffix;
            String fileSavePath = savePath + "/" + fileSaveName;
            CacheImageUtil.refreshImages(imageUrl, fileSavePath, savePath, null);

            String dbFilePath = originPath + "/" + fileSaveName;
            imagesMap.put(imageUrl, dbFilePath);
            // Literal replacement: the URL and the path must not be interpreted as a
            // regular expression or as a replacement template.
            localized = localized.replace(imageUrl, dbFilePath);
        }
        return localized.replace("data-src", "src");
    }

    /**
     * Derives the file extension from the last URL segment by taking the value of its
     * first {@code key=value} pair; the value {@code other} is mapped to {@code jpg}.
     *
     * @param lastSegment the final path segment including the query string
     * @return the extension, or {@code null} when the segment carries no value
     */
    private static String extractImageSuffix(String lastSegment) {
        String[] byEquals = lastSegment.split("=");
        if (byEquals.length < 2) {
            return null;
        }
        String[] byAmpersand = byEquals[1].split("&");
        if (byAmpersand.length == 0) {
            return null;
        }
        String suffix = byAmpersand[0];
        return "other".equals(suffix) ? "jpg" : suffix;
    }

    /**
     * Tells whether a URL-derived segment may be used as a single path component.
     *
     * @param segment the candidate path component
     * @return {@code false} for empty segments, {@code .}, {@code ..} and anything that
     *         contains a backslash
     */
    private static boolean isSafePathSegment(String segment) {
        return !segment.isEmpty()
                && !".".equals(segment)
                && !"..".equals(segment)
                && segment.indexOf('\\') < 0;
    }
}

以上代码仅供参考

Comments: 3

「人生在世,留句话给我吧」

提交评论