1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
| package main
import ( "context" "fmt" "github.com/gocolly/colly" "github.com/gocolly/colly/debug" "github.com/zolamk/colly-mongo-storage/colly/mongo" "go.mongodb.org/mongo-driver/bson" mongo2 "go.mongodb.org/mongo-driver/mongo" "go.mongodb.org/mongo-driver/mongo/options" "time" )
// Book groups five string attributes of a book record. The struct tags fix
// the lower-case key each field is given when the value is encoded as JSON.
type Book struct {
	Title  string `json:"title"`
	Author string `json:"author"`
	Tag    string `json:"tag"`
	Hot    string `json:"hot"`
	Chars  string `json:"chars"`
}
const ( ROOT_URL = "http://xsw.xyz/shuku/" MONGO_URI = "mongodb://localhost:27017" ) var coll *mongo2.Collection func init() { client, err := mongo2.NewClient(options.Client().ApplyURI(MONGO_URI)) if err != nil { panic(err) } ctx,_ := context.WithTimeout(context.Background(), 10*time.Second) err = client.Connect(ctx) if err != nil { panic(err) } coll = client.Database("xsw").Collection("book_list") }
func main() { c := colly.NewCollector( colly.Debugger(&debug.LogDebugger{}), colly.AllowedDomains("xsw.xyz"), colly.CacheDir("./xsw"), colly.UserAgent("xsw"), colly.Async(true), ) storage := &mongo.Storage{ Database:"books_xsw", URI:MONGO_URI, } if err := c.SetStorage(storage); err != nil { panic(err) } c.Limit(&colly.LimitRule{ DomainGlob:"xsw.*,", Parallelism:2, RandomDelay:3*time.Second, }) detailCollector := c.Clone() c.OnHTML(".container .mod .bd", func(e *colly.HTMLElement) { fmt.Println("found list") e.ForEach("ul .column-2", func(_ int, el *colly.HTMLElement) { title := el.ChildText(".right .name") author := el.ChildText(".info .author") words := el.ChildText(".info .words") update := el.ChildText(".info font") ctx,_ := context.WithTimeout(context.Background(), 3*time.Second) _,err := coll.InsertOne(ctx,bson.M{"title":title,"author":author,"words":words,"update":update}) if err != nil { fmt.Println(err) } }) }) c.OnHTML(".nextPage", func(e *colly.HTMLElement) { e.Request.Visit(e.Attr("href")) }) c.OnError(func(response *colly.Response, e error) { fmt.Println("Request URL",response.Request.URL,", failed with response:",response,"\nError:",e) }) detailCollector.OnHTML("div[mod block book-all-list]", func(e *colly.HTMLElement) { }) c.Visit(ROOT_URL) time.Sleep(120*time.Second) }
|