Skip to content

Commit f3e914d

Browse files
committed
简单爬虫练习
1 parent d741597 commit f3e914d

File tree

2 files changed

+307
-0
lines changed

2 files changed

+307
-0
lines changed

news_spider/news_spider.go

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
package main
2+
3+
import (
4+
"database/sql"
5+
"fmt"
6+
"github.com/PuerkitoBio/goquery"
7+
_ "github.com/go-sql-driver/mysql"
8+
"time"
9+
)
10+
11+
// NewsSpider is the configuration of one periodic crawl job: where to crawl,
// where to store the results, and how often to repeat.
type NewsSpider struct {
	// TargetUrl is the listing page to crawl; it is also used as the prefix
	// for article links. TargetSource is the value stored as the article's
	// source.
	TargetUrl, TargetSource string

	// MysqlDSN is the data source name passed to sql.Open. InsertStmt stores
	// one article and QueryStmt checks whether a title is already stored.
	MysqlDSN, InsertStmt, QueryStmt string

	// Duration is the crawl interval in minutes.
	Duration int
}
19+
20+
// Paper is one scraped article. Every field holds text taken from the page
// as-is; Time is the date text from the detail page joined with the time text
// from the listing entry.
type Paper struct {
	Title, ImgAddr, Desc, Content, Author, Time string
}
28+
29+
func main() {
30+
(&NewsSpider{
31+
TargetUrl: "https://news.jin10.com",
32+
TargetSource: "金十数据",
33+
MysqlDSN: "root:123456@tcp(localhost:3306)/phdb?charset=utf8",
34+
InsertStmt: "INSERT INTO consult_info( consult_title, pic_addr, consult_desc, consult_content, consult_source, consult_author, create_date ) VALUES( ?, ?, ?, ?, ?, ?, ? )",
35+
QueryStmt: "SELECT IF(COUNT(*),'true','false') FROM consult_info WHERE consult_title = ?",
36+
Duration: 30,
37+
}).Run()
38+
}
39+
40+
func (ns *NewsSpider) Run() {
41+
ticker := time.NewTicker(time.Duration(ns.Duration) * time.Minute)
42+
ns.newsSpider()
43+
for range ticker.C {
44+
ns.newsSpider()
45+
}
46+
}
47+
48+
// 新闻爬虫
49+
func (ns *NewsSpider) newsSpider() {
50+
// mysql 初始化
51+
db, err := sql.Open("mysql", ns.MysqlDSN)
52+
if nil != err {
53+
time.Sleep(time.Duration(1) * time.Minute)
54+
if db, err = sql.Open("mysql", ns.MysqlDSN); nil != err {
55+
panic(err.Error())
56+
}
57+
}
58+
defer db.Close()
59+
60+
doc, err := goquery.NewDocument(ns.TargetUrl)
61+
if nil != err {
62+
return
63+
}
64+
65+
paper := Paper{}
66+
// 遍历类节点
67+
doc.Find(".jin-newsList__item").Each(func(i int, s *goquery.Selection) {
68+
// 文章封面
69+
a_img := s.Find(".J_lazyImg").Eq(0)
70+
if nil == a_img {
71+
return
72+
}
73+
img, _ := a_img.Attr("data-original")
74+
75+
// 文章链接
76+
a_href := s.Find("a").Eq(0)
77+
if nil == a_href {
78+
return
79+
}
80+
// 抓取详情
81+
article_id, _ := a_href.Attr("href")
82+
article_href := fmt.Sprintf("%s%s", ns.TargetUrl, article_id)
83+
// 获取详情的dom
84+
dom, err := goquery.NewDocument(article_href)
85+
if nil != err {
86+
return
87+
}
88+
// 设定来源
89+
//source := "金十数据"
90+
// 观看次数
91+
// hit := dom.Find(".jin-meta p").Eq(0).Text()
92+
// 评论数
93+
// msg := dom.Find(".jin-meta p").Eq(1).Text()
94+
95+
// paper := Paper{
96+
// Title: dom.Find(".jin-news-article_title").Text(), // 新闻标题
97+
// ImgAddr: img,
98+
// Desc: dom.Find(".jin-news-article_description").Text(), // 文章描述
99+
// Content: dom.Find(".jin-news-article_content").Text(), // 文章内容
100+
// Author: dom.Find(".jin-meta p").Eq(3).Text(),
101+
// Time: dom.Find(".jin-meta p").Eq(2).Text() + " " + s.Find(".jin-newsList__time").Text(), // 发布日期
102+
// }
103+
104+
paper.Title = dom.Find(".jin-news-article_title").Text() // 新闻标题
105+
paper.ImgAddr = img
106+
paper.Desc = dom.Find(".jin-news-article_description").Text() // 文章描述
107+
paper.Content = dom.Find(".jin-news-article_content").Text() // 文章内容
108+
paper.Author = dom.Find(".jin-meta p").Eq(3).Text()
109+
paper.Time = dom.Find(".jin-meta p").Eq(2).Text() + " " + s.Find(".jin-newsList__time").Text() // 发布日期
110+
111+
// mysql 查询
112+
if isExist, err := ns.fetchRow(db, paper.Title); nil == err && !isExist {
113+
ns.insert(db, paper.Title, paper.ImgAddr, paper.Desc, paper.Content, ns.TargetSource, paper.Author, paper.Time)
114+
}
115+
})
116+
}
117+
118+
//插入
119+
func (ns *NewsSpider) insert(db *sql.DB, args ...interface{}) (int64, error) {
120+
stmtIns, err := db.Prepare(ns.InsertStmt)
121+
if err != nil {
122+
return 0, err
123+
}
124+
defer stmtIns.Close()
125+
126+
result, err := stmtIns.Exec(args...)
127+
if err != nil {
128+
return 0, err
129+
}
130+
return result.LastInsertId()
131+
}
132+
133+
//取一行数据,
134+
func (ns *NewsSpider) fetchRow(db *sql.DB, args ...interface{}) (isExist bool, err error) {
135+
stmtOut, err := db.Prepare(ns.QueryStmt)
136+
if err != nil {
137+
return
138+
}
139+
defer stmtOut.Close()
140+
141+
err = stmtOut.QueryRow(args...).Scan(&isExist)
142+
return
143+
}

news_spider/news_spider_list.go

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
package main
2+
3+
import (
4+
"database/sql"
5+
"fmt"
6+
"github.com/PuerkitoBio/goquery"
7+
_ "github.com/go-sql-driver/mysql"
8+
"sync"
9+
"time"
10+
)
11+
12+
// NewsSpider is the configuration of one periodic crawl job: where to crawl,
// where to store the results, and how often to repeat.
type NewsSpider struct {
	// TargetUrl is the listing page to crawl; it is also used as the prefix
	// for article links. TargetSource is the value stored as the article's
	// source.
	TargetUrl, TargetSource string

	// MysqlDSN is the data source name passed to sql.Open. InsertStmt stores
	// one article and QueryStmt checks whether a title is already stored.
	MysqlDSN, InsertStmt, QueryStmt string

	// Duration is the crawl interval in minutes.
	Duration int
}
20+
21+
// Paper is one scraped article. Every field holds text taken from the page
// as-is; Time is the date text from the detail page joined with the time text
// from the listing entry.
type Paper struct {
	Title, ImgAddr, Desc, Content, Author, Time string
}
29+
30+
var (
31+
NewsSpiderList = []*NewsSpider{
32+
&NewsSpider{
33+
TargetUrl: "https://news.jin10.com",
34+
TargetSource: "金十数据",
35+
MysqlDSN: "root:123456@tcp(localhost:3306)/phdb?charset=utf8",
36+
InsertStmt: "INSERT INTO consult_info( consult_title, pic_addr, consult_desc, consult_content, consult_source, consult_author, create_date ) VALUES( ?, ?, ?, ?, ?, ?, ? )",
37+
QueryStmt: "SELECT IF(COUNT(*),'true','false') FROM consult_info WHERE consult_title = ?",
38+
Duration: 30,
39+
},
40+
&NewsSpider{
41+
TargetUrl: "https://news.jin10.com",
42+
TargetSource: "金十数据",
43+
MysqlDSN: "root:123456@tcp(localhost:3306)/phdb?charset=utf8",
44+
InsertStmt: "INSERT INTO consult_info( consult_title, pic_addr, consult_desc, consult_content, consult_source, consult_author, create_date ) VALUES( ?, ?, ?, ?, ?, ?, ? )",
45+
QueryStmt: "SELECT IF(COUNT(*),'true','false') FROM consult_info WHERE consult_title = ?",
46+
Duration: 5,
47+
},
48+
}
49+
)
50+
51+
func main() {
52+
var wg sync.WaitGroup
53+
wg.Add(len(NewsSpiderList))
54+
for _, ns := range NewsSpiderList {
55+
go ns.Run(&wg)
56+
}
57+
wg.Wait()
58+
}
59+
60+
func (ns *NewsSpider) Run(wg *sync.WaitGroup) {
61+
defer wg.Done()
62+
ticker := time.NewTicker(time.Duration(ns.Duration) * time.Minute)
63+
ns.newsSpider()
64+
for range ticker.C {
65+
ns.newsSpider()
66+
}
67+
}
68+
69+
// 新闻爬虫
70+
func (ns *NewsSpider) newsSpider() {
71+
// mysql 初始化
72+
db, err := sql.Open("mysql", ns.MysqlDSN)
73+
if nil != err {
74+
time.Sleep(time.Duration(1) * time.Minute)
75+
if db, err = sql.Open("mysql", ns.MysqlDSN); nil != err {
76+
panic(err.Error())
77+
}
78+
}
79+
defer db.Close()
80+
81+
doc, err := goquery.NewDocument(ns.TargetUrl)
82+
if nil != err {
83+
return
84+
}
85+
86+
paper := Paper{}
87+
// 遍历类节点
88+
doc.Find(".jin-newsList__item").Each(func(i int, s *goquery.Selection) {
89+
// 文章封面
90+
a_img := s.Find(".J_lazyImg").Eq(0)
91+
if nil == a_img {
92+
return
93+
}
94+
img, _ := a_img.Attr("data-original")
95+
96+
// 文章链接
97+
a_href := s.Find("a").Eq(0)
98+
if nil == a_href {
99+
return
100+
}
101+
// 抓取详情
102+
article_id, _ := a_href.Attr("href")
103+
article_href := fmt.Sprintf("%s%s", ns.TargetUrl, article_id)
104+
// 获取详情的dom
105+
dom, err := goquery.NewDocument(article_href)
106+
if nil != err {
107+
return
108+
}
109+
// 设定来源
110+
//source := "金十数据"
111+
// 观看次数
112+
// hit := dom.Find(".jin-meta p").Eq(0).Text()
113+
// 评论数
114+
// msg := dom.Find(".jin-meta p").Eq(1).Text()
115+
116+
// paper := Paper{
117+
// Title: dom.Find(".jin-news-article_title").Text(), // 新闻标题
118+
// ImgAddr: img,
119+
// Desc: dom.Find(".jin-news-article_description").Text(), // 文章描述
120+
// Content: dom.Find(".jin-news-article_content").Text(), // 文章内容
121+
// Author: dom.Find(".jin-meta p").Eq(3).Text(),
122+
// Time: dom.Find(".jin-meta p").Eq(2).Text() + " " + s.Find(".jin-newsList__time").Text(), // 发布日期
123+
// }
124+
125+
paper.Title = dom.Find(".jin-news-article_title").Text() // 新闻标题
126+
paper.ImgAddr = img
127+
paper.Desc = dom.Find(".jin-news-article_description").Text() // 文章描述
128+
paper.Content = dom.Find(".jin-news-article_content").Text() // 文章内容
129+
paper.Author = dom.Find(".jin-meta p").Eq(3).Text()
130+
paper.Time = dom.Find(".jin-meta p").Eq(2).Text() + " " + s.Find(".jin-newsList__time").Text() // 发布日期
131+
132+
// mysql 查询
133+
if isExist, err := ns.fetchRow(db, paper.Title); nil == err && !isExist {
134+
ns.insert(db, paper.Title, paper.ImgAddr, paper.Desc, paper.Content, ns.TargetSource, paper.Author, paper.Time)
135+
}
136+
})
137+
}
138+
139+
//插入
140+
func (ns *NewsSpider) insert(db *sql.DB, args ...interface{}) (int64, error) {
141+
stmtIns, err := db.Prepare(ns.InsertStmt)
142+
if err != nil {
143+
return 0, err
144+
}
145+
defer stmtIns.Close()
146+
147+
result, err := stmtIns.Exec(args...)
148+
if err != nil {
149+
return 0, err
150+
}
151+
return result.LastInsertId()
152+
}
153+
154+
//取一行数据,
155+
func (ns *NewsSpider) fetchRow(db *sql.DB, args ...interface{}) (isExist bool, err error) {
156+
stmtOut, err := db.Prepare(ns.QueryStmt)
157+
if err != nil {
158+
return
159+
}
160+
defer stmtOut.Close()
161+
162+
err = stmtOut.QueryRow(args...).Scan(&isExist)
163+
return
164+
}

0 commit comments

Comments
 (0)