掘金小册 买了n本书,但借阅卡有时间限制,看不完.想要保存下来以后阅读.
总体流程:
在config里填写:
1 需要下载的小册id(浏览器url里查看)
2 浏览器中的sessionId
Download方法:
1 找到一本小册的所有section,加入 goroutine,DownloadOneBook方法,创建目录,获得section下的markdown和图片列表
2 正则找到所有图片列表,加入另一个goroutine,下载保存图片,替换markdown中的图片链接为本地.
// TestDownload2Markdown is a manual end-to-end run: it builds a downloader for
// the booklet IDs listed below and calls Download on it.
//
// NOTE(review): the Sessionid below looks like a real session cookie value;
// consider loading it from the environment instead of committing it.
func TestDownload2Markdown(t *testing.T) {
	c := Config{
		Sessionid: "43680406158c713253e8bfe966d70f80",
		BookIDs: []string{
			//"7302990019642261567",
			//"6918979822425210891", // 0 building a general-purpose low-... (title truncated)
			//"7202598408815640631", // front-end dependency governance
			//"7269673629964173331", // front-end visualization: introduction and practice
			"7288940354408022074", // a journey through web animation
		},
		SaveDir: "", // empty: NewConfig falls back to its default directory
	}
	juejin, err := NewConfig(c)
	if err != nil {
		// NewConfig returns a nil downloader together with the error, so
		// carrying on to Download would dereference a nil pointer.
		t.Fatal(fmt.Errorf("NewConfig: %w", err))
	}
	juejin.Download()
}
package juejinBook
import (
"encoding/json"
"fmt"
"log"
"myCrawler/utils"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
)
// Juejin booklet API endpoints and the fallback output directory name.
const (
	// GetBookInfoURL is the endpoint GetBookInfoRes sends a booklet ID to.
	GetBookInfoURL = "https://api.juejin.cn/booklet_api/v1/booklet/get"
	// GetSectionURL is the endpoint GetSectionRes sends a section ID to.
	GetSectionURL = "https://api.juejin.cn/booklet_api/v1/section/get"
	// DefaultSaveDir is the directory name NewConfig uses for its default
	// output location when Config.SaveDir is empty.
	DefaultSaveDir = "book"
)
// JuejinListRequest is the request body that GetAllBookListSortLatestSaveToJSON
// marshals to JSON for the booklet list-by-category endpoint.
//
// NOTE(review): the fields carry no json tags, so encoding/json emits the Go
// field names (e.g. "CategoryID") as keys — confirm the API accepts that
// spelling.
type JuejinListRequest struct {
// CategoryID selects a category; the only caller in this file sends 0
// (presumably "all categories" — TODO confirm).
CategoryID int
// Cursor is the paging cursor; the only caller in this file sends "0".
Cursor string
// Sort selects the ordering; the only caller in this file sends 10
// (presumably "latest first", going by the caller's name — TODO confirm).
Sort int
// IsVIP is sent as 0 by the only caller in this file; meaning not visible here.
IsVIP int
// Limit is the page size; the only caller in this file sends 10000.
Limit int
}
// GetAllBookListSortLatestSaveToJSON requests the booklet list from the
// list-by-category endpoint in one call (Limit 10000), prints how many entries
// came back and hands the response to utils.WriteJSON with the file name
// "juejin_book.json".
//
// The URL was captured from a Chrome session; the body parameters are the ones
// seen in that request with adjusted values.
//
// A failure to build the request body is logged and the function returns; a
// failed request panics.
func GetAllBookListSortLatestSaveToJSON() {
	const listURL = "https://api.juejin.cn/booklet_api/v1/booklet/listbycategory?aid=2608&uuid=7220793504238650912&spider=0"

	payload, err := json.Marshal(JuejinListRequest{
		CategoryID: 0,
		Cursor:     "0",
		Sort:       10,
		IsVIP:      0,
		Limit:      10000,
	})
	if err != nil {
		// Report the failure instead of returning silently, so an empty
		// result can be traced back to its cause.
		log.Printf("marshal booklet list request: %v", err)
		return
	}

	// NOTE(review): the other call sites in this file pass a map to
	// PostToStructInputStruct, while this one passes pre-marshalled bytes —
	// confirm the helper does not JSON-encode the []byte a second time.
	response, err := utils.PostToStructInputStruct[JuejinResponse](listURL, payload, "")
	if err != nil {
		panic(err)
	}

	fmt.Println("掘金一共有", len(response.Data), "本册子")
	utils.WriteJSON(response, "juejin_book.json")
}
// NewConfig validates config and builds the downloader from it.
//
// Sessionid and BookIDs are required. When config.SaveDir is empty, output
// goes to DefaultSaveDir under the current working directory. The save
// directory is created (including parents) if it does not exist yet.
//
// The returned downloader carries the session ID both as a field and as a
// "cookie" request header, plus an empty MarkdownSavePaths map.
//
// It returns a nil downloader and an error when a required field is missing,
// when the working directory is needed but cannot be determined, or when the
// save directory cannot be created.
func NewConfig(config Config) (*Juejinxiaoce2Markdown, error) {
	if config.Sessionid == "" {
		return nil, fmt.Errorf("sessionid is empty")
	}
	if len(config.BookIDs) == 0 {
		return nil, fmt.Errorf("bookIDs is empty")
	}

	saveDir := config.SaveDir
	if saveDir == "" {
		// Only resolve the working directory when the default is actually
		// needed, and use os.Getwd rather than the PWD environment variable,
		// which is not set on every platform or launcher.
		wd, err := os.Getwd()
		if err != nil {
			return nil, fmt.Errorf("get working directory: %w", err)
		}
		saveDir = filepath.Join(wd, DefaultSaveDir)
	}
	if err := os.MkdirAll(saveDir, os.ModePerm); err != nil {
		return nil, fmt.Errorf("create save dir failed: %w", err)
	}

	return &Juejinxiaoce2Markdown{
		ImgPattern:        regexp.MustCompile(`!\[.*?\]\((.*?)\)`),
		Sessionid:         config.Sessionid,
		BookIDs:           config.BookIDs,
		SaveDir:           saveDir,
		RequestHeaders:    map[string]string{"cookie": fmt.Sprintf("sessionid=%s;", config.Sessionid)},
		MarkdownSavePaths: make(map[string]string),
	}, nil
}
// GetSectionRes asks GetSectionURL for the section identified by sectionID,
// going through utils.PostToStructInputStruct with the downloader's session ID,
// and returns the decoded response or the helper's error.
func (j *Juejinxiaoce2Markdown) GetSectionRes(sectionID string) (JuejinSectionContent, error) {
	payload := make(map[string]string, 1)
	payload["section_id"] = sectionID
	return utils.PostToStructInputStruct[JuejinSectionContent](GetSectionURL, payload, j.Sessionid)
}
// GetBookInfoRes asks GetBookInfoURL for the booklet identified by bookID,
// going through utils.PostToStructInputStruct with the downloader's session ID,
// and returns the decoded response or the helper's error.
func (j *Juejinxiaoce2Markdown) GetBookInfoRes(bookID string) (JuejinSection, error) {
	return utils.PostToStructInputStruct[JuejinSection](
		GetBookInfoURL,
		map[string]string{"booklet_id": bookID},
		j.Sessionid,
	)
}
// dealBookAndSectionTitle returns s with every backslash, forward slash and
// vertical bar removed. Callers use the result as a directory or file name
// component.
func dealBookAndSectionTitle(s string) string {
	cleaned := s
	for _, unwanted := range []string{"\\", "/", "|"} {
		cleaned = strings.ReplaceAll(cleaned, unwanted, "")
	}
	return cleaned
}
// Download processes every booklet in j.BookIDs concurrently and returns once
// all of them have been handled.
//
// A fixed pool of 30 workers reads booklet IDs from a channel and calls
// DownloadOneBook for each. A failing booklet is reported on stdout and does
// not stop the others.
func (j *Juejinxiaoce2Markdown) Download() {
	fmt.Println("books need to download count ", len(j.BookIDs))

	const maxWorkerCount = 30
	queue := make(chan string, maxWorkerCount)
	runtime.GOMAXPROCS(runtime.NumCPU())

	var wg sync.WaitGroup
	for i := 0; i < maxWorkerCount; i++ {
		// Register the worker before starting it. Calling Add from inside the
		// goroutine races with Wait below: with few booklets the queue is
		// filled and closed before any worker runs, Wait sees a zero counter
		// and Download returns without having downloaded anything.
		wg.Add(1)
		go func() {
			defer wg.Done()
			for bookID := range queue {
				if err := j.DownloadOneBook(bookID); err != nil {
					fmt.Println("Error", err)
				}
			}
		}()
	}

	for _, bookID := range j.BookIDs {
		queue <- bookID
	}
	// Closing the queue ends the workers' range loops.
	close(queue)
	wg.Wait()
}
// markdownSavePathsMu guards writes to Juejinxiaoce2Markdown.MarkdownSavePaths.
// Download runs DownloadOneBook on several goroutines against the same
// receiver, and unsynchronised map writes are a data race that the runtime
// may abort on.
var markdownSavePathsMu sync.Mutex

// DownloadOneBook fetches the booklet identified by bookID and saves each of
// its sections as a Markdown file.
//
// Layout under j.SaveDir, where <title> is the sanitised booklet title and n is
// the 1-based section position:
//
//	<title>/<n>-<section title>.md
//	<title>/img/<n>/          images referenced by section n
//
// Each section's Markdown path is recorded in j.MarkdownSavePaths under its
// section ID. The function stops at the first failed API request or directory
// creation and returns that error wrapped; problems inside SaveMarkdown are
// not reported back here.
func (j *Juejinxiaoce2Markdown) DownloadOneBook(bookID string) error {
	log.Printf("开始处理小册")
	bookInfo, err := j.GetBookInfoRes(bookID)
	if err != nil {
		return fmt.Errorf("GetBookInfoRes failed: %w", err)
	}

	bookTitle := dealBookAndSectionTitle(bookInfo.Data.Booklet.BaseInfo.Title)
	log.Printf("book_title: %s", bookTitle)

	bookSavePath := filepath.Join(j.SaveDir, bookTitle)
	if err := os.MkdirAll(bookSavePath, os.ModePerm); err != nil {
		return fmt.Errorf("create book save path failed: %w", err)
	}
	imgDir := filepath.Join(bookSavePath, "img")
	if err := os.MkdirAll(imgDir, os.ModePerm); err != nil {
		return fmt.Errorf("create img dir failed: %w", err)
	}

	sections := bookInfo.Data.Sections
	sectionTotalLength := len(sections)
	for sectionIndex, section := range sections {
		sectionID := section.SectionID
		sectionOrder := sectionIndex + 1
		orderDir := strconv.Itoa(sectionOrder)

		content, err := j.GetSectionRes(sectionID)
		if err != nil {
			return fmt.Errorf("GetSectionRes failed: %w", err)
		}
		sectionTitle := dealBookAndSectionTitle(content.Data.Section.Title)
		markdownStr := content.Data.Section.MarkdownShow
		markdownFilePath := filepath.Join(bookSavePath, fmt.Sprintf("%d-%s.md", sectionOrder, sectionTitle))
		sectionImgDir := filepath.Join(imgDir, orderDir)
		log.Printf("进度: %d/%d, 处理 section >> %s", sectionOrder, sectionTotalLength, sectionTitle)

		if err := os.MkdirAll(sectionImgDir, os.ModePerm); err != nil {
			return fmt.Errorf("create section img dir failed: %w", err)
		}
		// Image links in the saved Markdown are relative to the booklet directory.
		markdownRelativeImgDir := filepath.Join("img", orderDir)

		markdownSavePathsMu.Lock()
		j.MarkdownSavePaths[sectionID] = markdownFilePath
		markdownSavePathsMu.Unlock()

		j.SaveMarkdown(sectionIndex, markdownFilePath, sectionImgDir, markdownRelativeImgDir, markdownStr)
	}
	log.Printf("处理完成")
	return nil
}
var (
	// imgTagSrcRE captures the src attribute value of an HTML <img> tag.
	imgTagSrcRE = regexp.MustCompile(`<img[^>]+\bsrc=["']([^"']+)["']`)
	// bareImgURLRE matches an https URL up to and including the first
	// recognised image extension.
	bareImgURLRE = regexp.MustCompile(`https://.*?\.(jpg|jpeg|png|gif|image|awebp|webp)`)
)

// FindImageUrls extracts image URLs from htmls, which may be HTML or Markdown.
//
// It first collects the src of every <img> tag whose value contains "http".
// Only when that yields nothing does it fall back to scanning for bare
// https URLs ending in a known image extension (jpg, jpeg, png, gif, image,
// awebp, webp). Backslashes are stripped from every returned URL, so escaped
// forms such as https:\/\/host\/a.jpg come back as plain URLs.
//
// URLs are returned in order of appearance and are not de-duplicated. The
// result is never nil. sectionIndex is not used; it is kept so existing
// callers keep compiling.
func FindImageUrls(sectionIndex int, htmls string) []string {
	tagMatches := imgTagSrcRE.FindAllStringSubmatch(htmls, -1)
	out := make([]string, 0, len(tagMatches))
	for _, m := range tagMatches {
		if src := m[1]; strings.Contains(src, "http") {
			out = append(out, strings.ReplaceAll(src, "\\", ""))
		}
	}
	if len(out) > 0 {
		return out
	}

	// No usable <img> tags: look for bare image URLs instead.
	for _, u := range bareImgURLRE.FindAllString(htmls, -1) {
		out = append(out, strings.ReplaceAll(u, "\\", ""))
	}
	return out
}
// SaveMarkdown downloads the images referenced by markdownStr, points the
// references at the local copies and writes the result to markdownFilePath.
//
// Parameters:
//   - sectionIndex: passed to FindImageUrls and printed in the progress line.
//   - markdownFilePath: destination of the rewritten Markdown file.
//   - sectionImgDir: directory the image files are saved into.
//   - markdownRelativeImgDir: directory prefix written into the Markdown in
//     place of each image URL.
//   - markdownStr: the section's Markdown source.
//
// Images are numbered 1, 2, ... in the order FindImageUrls returns them and are
// fetched by a pool of 8 workers. The Markdown file is written after all
// workers have finished. Download and write failures are printed to stdout;
// nothing is returned.
func (j *Juejinxiaoce2Markdown) SaveMarkdown(sectionIndex int, markdownFilePath string, sectionImgDir string, markdownRelativeImgDir string, markdownStr string) {
	imgUrls := FindImageUrls(sectionIndex, markdownStr)
	fmt.Println("sectionIndex images download count ", sectionIndex, len(imgUrls))

	type imageJob struct {
		url      string
		savePath string
	}

	const maxWorkerCount = 8
	queue := make(chan imageJob, maxWorkerCount)
	runtime.GOMAXPROCS(runtime.NumCPU())

	var wg sync.WaitGroup
	for i := 0; i < maxWorkerCount; i++ {
		// Register the worker before starting it. Calling Add from inside the
		// goroutine races with Wait below: with few images the queue is filled
		// and closed before any worker runs, Wait sees a zero counter and the
		// function returns while downloads have not even started.
		wg.Add(1)
		go func() {
			defer wg.Done()
			for job := range queue {
				if err := utils.RequestThanSaveImage(job.url, job.savePath); err != nil {
					fmt.Println("Error downloading image:", err)
				}
			}
		}()
	}

	for imgIndex, imgUrl := range imgUrls {
		downloadURL := strings.TrimSpace(imgUrl)
		if strings.HasPrefix(downloadURL, "//") {
			// Protocol-relative URL: default to https.
			downloadURL = "https:" + downloadURL
		}

		// Every image is stored with a .png name regardless of the extension in
		// its URL (presumably so viewers render URLs ending in e.g. ".image" —
		// TODO confirm).
		imgFileName := fmt.Sprintf("%d%s", imgIndex+1, ".png")
		mdRelativeImgPath := filepath.Join(markdownRelativeImgDir, imgFileName)
		imgSavePath := filepath.Join(sectionImgDir, imgFileName)

		// Replace the URL exactly as it was found, not the normalised form.
		markdownStr = strings.ReplaceAll(markdownStr, imgUrl, mdRelativeImgPath)

		queue <- imageJob{url: downloadURL, savePath: imgSavePath}
	}
	// Closing the queue ends the workers' range loops.
	close(queue)
	wg.Wait()

	if err := os.WriteFile(markdownFilePath, []byte(markdownStr), 0644); err != nil {
		fmt.Println("Error saving Markdown file:", err)
	}
}