Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
// Sentinel errors describing problems with a crawler configuration.
//
// NOTE(review): these are presumably the errors returned by
// CrawlerConf.IsValid; confirm against its implementation.
var (
	// ErrEmptyCrawlerName reports a crawler conf with an empty crawler_name.
	ErrEmptyCrawlerName = errors.New("types/types.go empty crawler_name of crawler conf")

	// ErrUnSupportedCrawlerType reports a crawler conf whose crawler_type is
	// not supported.
	ErrUnSupportedCrawlerType = errors.New("types/types.go unsupported crawler_type of crawler conf")

	// ErrEmptyStartUrls reports a crawler conf with empty start_urls.
	ErrEmptyStartUrls = errors.New("types/types.go empty start_urls of crawler conf")

	// ErrEmptyUrlsFile reports a crawler conf with an empty urls_file.
	ErrEmptyUrlsFile = errors.New("types/types.go empty urls_file of crawler conf")

	// ErrNoStartRule reports a crawler conf that has no start task conf rule.
	ErrNoStartRule = errors.New("types/types.go empty start task conf rule of crawler conf")
)
Functions ¶
This section is empty.
Types ¶
type CrawlerConf ¶
type CrawlerConf struct { CrawlerType string `json:"crawler_type" bson:"crawler_type"` CrawlerName string `json:"crawler_name" bson:"crawler_name"` CrawlerDesp string `json:"crawler_desp" bson:"crawler_desp"` StartUrls []string `json:"start_urls" bson:"start_urls"` UrlsFile string `json:"urls_file" bson:"urls_file"` ParseConfs map[string]ParseConf `json:"parse_confs" bson:"parse_confs"` StartParserName string `json:"start_parser_name" bson:"start_parser_name"` EsUri string `json:"es_uri" bson:"es_uri"` }
func (*CrawlerConf) Id ¶
func (self *CrawlerConf) Id() string
func (*CrawlerConf) IsValid ¶
func (conf *CrawlerConf) IsValid() (bool, error)
func (*CrawlerConf) Type ¶
func (self *CrawlerConf) Type() string
type CrawlerItem ¶
type CrawlerItem struct { CrawlerName string `json:"crawler_name" bson:"crawler_name"` Conf CrawlerConf `json:"conf" bson:"conf"` Weight int `json:"weight" bson:"weight"` Status string `json:"status" bson:"status"` CreateTime int64 `json:"create_time" bson:"create_time"` ModifyTime int64 `json:"modify_time" bson:"modify_time"` Author string `json:"author" bson:"author"` }
type ParseConf ¶
type ParseConf struct { ParserType string `json:"parser_type" bson:"parser_type"` ParserName string `json:"parser_name" bson:"parser_name"` NoDefaultFields bool `json:"no_default_fields" bson:"no_default_fields"` ExampleUrl string `json:"example_url" bson:"example_url"` Rules map[string][]ParseRule `json:"rules" bson:"rules"` // RuleName to ParseRules PostProcessor string `json:"post_processor" bson:"post_processor"` RevisitInterval int64 `json:"revisit_interval" bson:"revisit_interval"` }
type ParseRule ¶
// ParseRule is a single extraction rule. Every field is serialized under the
// same snake_case key in both JSON and BSON.
type ParseRule struct {
	// RuleType is one of the four rule types: url, dom, string or html.
	RuleType string `json:"rule_type" bson:"rule_type"`

	// ItemKey holds the name of the next rule when RuleType is dom.
	ItemKey string `json:"item_key" bson:"item_key"`

	// IsSeedUrl tells whether the item generated by this rule is a seed.
	IsSeedUrl bool `json:"is_seed_url" bson:"is_seed_url"`

	// Xpath is stored as "xpath".
	Xpath string `json:"xpath" bson:"xpath"`

	// Regex is stored as "regex".
	Regex string `json:"regex" bson:"regex"`

	// Js is stored as "js".
	Js string `json:"js" bson:"js"`
}
type Task ¶
// Task names a crawler, a parser and a URL, together with payload data and
// revisit bookkeeping. Every field is serialized under the same snake_case
// key in both JSON and BSON.
type Task struct {
	// CrawlerName is stored as "crawler_name".
	CrawlerName string `json:"crawler_name" bson:"crawler_name"`

	// ParserName is stored as "parser_name".
	ParserName string `json:"parser_name" bson:"parser_name"`

	// IsSeedUrl is stored as "is_seed_url".
	IsSeedUrl bool `json:"is_seed_url" bson:"is_seed_url"`

	// Url is stored as "url".
	Url string `json:"url" bson:"url"`

	// Data is stored as "data".
	// NOTE(review): the content and format of Data are not visible here.
	Data string `json:"data" bson:"data"`

	// LastAccessTime is stored as "last_access_time".
	// NOTE(review): presumably a Unix timestamp; confirm the unit.
	LastAccessTime int64 `json:"last_access_time" bson:"last_access_time"`

	// RevisitInterval is stored as "revisit_interval".
	// NOTE(review): the time unit is not visible here; confirm.
	RevisitInterval int64 `json:"revisit_interval" bson:"revisit_interval"`
}
Click to show internal directories.
Click to hide internal directories.