Documentation
¶
Index ¶
- Constants
- Variables
- type Crawler
- type CrawlerQueue
- type Decoder
- type Document
- type DocumentType
- type FTPProto
- type FileProto
- type GzipDecoder
- type HTMLDecoder
- type HTTPProto
- type ImageDecoder
- type MediaDecoder
- type PDFDecoder
- type Proto
- type SFTPProto
- type Settings
- type StdCrawlerQueue
- type TarDecoder
- type ZIPDecoder
Constants ¶
View Source
const (
	DocumentFile = DocumentType(iota)
	DocumentDirectory
	DocumentLink
	DocumentSpecial
)
Variables ¶
View Source
var (
	DefaultSettings = Settings{
		Protos: map[string]Proto{
			"http":  HTTPProto{},
			"https": HTTPProto{},
			"ftp":   NewFTPProto(),
			"sftp":  NewSFTPProto(),
		},
		Decoders: map[string]Decoder{
			"text/html":             DefaultHTMLDecoder,
			"application/xhtml+xml": DefaultHTMLDecoder,
			"application/pdf":       DefaultPDFDecoder,
			"image/png":             DefaultImageDecoder,
			"image/jpeg":            DefaultImageDecoder,
			"image/gif":             DefaultImageDecoder,
			"video/webm":            DefaultMediaDecoder,
			"audio/mpeg":            DefaultMediaDecoder,
			"application/ogg":       DefaultMediaDecoder,
			"application/zip":       DefaultZIPDecoder,
			"application/x-gzip":    DefaultGzipDecoder,
		},
	}
	DefaultCrawler = Crawler{
		Settings: DefaultSettings,
		Queue:    make(StdCrawlerQueue),
		Set:      *set.New(),
		Output:   make(chan Document),
	}
)
View Source
var (
	DefaultHTMLDecoder  = HTMLDecoder{}
	DefaultPDFDecoder   = PDFDecoder{}
	DefaultImageDecoder = ImageDecoder{}
	DefaultMediaDecoder = MediaDecoder{}
	DefaultZIPDecoder   = ZIPDecoder{}
	DefaultGzipDecoder  = GzipDecoder{}
)
Functions ¶
This section is empty.
Types ¶
type Crawler ¶
type Crawler struct {
	Settings
	Queue  CrawlerQueue
	Set    set.Set
	Output chan Document
	ErrC   chan error
	CloseC chan struct{}
}
func NewCrawler ¶
func NewCrawler() *Crawler
type CrawlerQueue ¶
type CrawlerQueue interface {
	// Send sends the list of urls in the given order to the queue.
	Send(urls ...string)
	// Recv receives one url from the queue and returns it. It may block.
	Recv() (url string)
}
A CrawlerQueue is in most cases equivalent to a channel for sending and receiving strings, but it exposes two methods instead so that external queue systems such as Redis and RabbitMQ can be used.
type Document ¶
type Document struct {
	URL          *url.URL
	Type         DocumentType
	ContentType  string
	Time         time.Time
	Size         int64
	Links        []string
	Title        string
	Version      string
	Album        string
	Artist       string
	Performer    string
	Copyright    string
	License      string
	Organisation string
	Genre        string
	Date         string
	ISRC         string
	Author       string
	Description  string
	Content      string
	NoIndex      bool
	NoFollow     bool
}
type DocumentType ¶
type DocumentType int
type FTPProto ¶
type FTPProto struct {
// contains filtered or unexported fields
}
func NewFTPProto ¶
func NewFTPProto() *FTPProto
type GzipDecoder ¶
type GzipDecoder struct {
Tar TarDecoder
}
func (GzipDecoder) Decode ¶
func (g GzipDecoder) Decode(doc *Document, rc io.ReadCloser) error
type HTMLDecoder ¶
type HTMLDecoder struct{}
func (HTMLDecoder) Decode ¶
func (h HTMLDecoder) Decode(doc *Document, rc io.ReadCloser) error
type ImageDecoder ¶
type ImageDecoder struct{}
func (ImageDecoder) Decode ¶
func (i ImageDecoder) Decode(doc *Document, rc io.ReadCloser) error
type MediaDecoder ¶
type MediaDecoder struct{}
func (MediaDecoder) Decode ¶
func (m MediaDecoder) Decode(doc *Document, rc io.ReadCloser) error
type PDFDecoder ¶
type PDFDecoder struct{}
func (PDFDecoder) Decode ¶
func (p PDFDecoder) Decode(doc *Document, rc io.ReadCloser) error
type SFTPProto ¶
type SFTPProto struct {
// contains filtered or unexported fields
}
func NewSFTPProto ¶
func NewSFTPProto() *SFTPProto
type StdCrawlerQueue ¶
type StdCrawlerQueue chan string
StdCrawlerQueue is a string channel with the methods required by CrawlerQueue.
func (StdCrawlerQueue) Recv ¶
func (s StdCrawlerQueue) Recv() string
Recv receives one string from the channel. It is just a wrapper for <-s, but is needed to fulfill the CrawlerQueue interface.
func (StdCrawlerQueue) Send ¶
func (s StdCrawlerQueue) Send(urls ...string)
Send sends the urls to the string channel. It is just a wrapper for s <- url, but is needed to fulfill the CrawlerQueue interface.
type TarDecoder ¶
type TarDecoder struct{}
func (TarDecoder) Decode ¶
func (t TarDecoder) Decode(doc *Document, rc io.ReadCloser) error
type ZIPDecoder ¶
type ZIPDecoder struct{}
func (ZIPDecoder) Decode ¶
func (z ZIPDecoder) Decode(doc *Document, rc io.ReadCloser) error
Click to show internal directories.
Click to hide internal directories.