Home Download Docs Code Community
     1	/*
     2	Copyright 2014 The Perkeep Authors
     3	
     4	Licensed under the Apache License, Version 2.0 (the "License");
     5	you may not use this file except in compliance with the License.
     6	You may obtain a copy of the License at
     7	
     8	     http://www.apache.org/licenses/LICENSE-2.0
     9	
    10	Unless required by applicable law or agreed to in writing, software
    11	distributed under the License is distributed on an "AS IS" BASIS,
    12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13	See the License for the specific language governing permissions and
    14	limitations under the License.
    15	*/
    16	
    17	// Package feed implements an importer for RSS, Atom, and RDF feeds.
    18	package feed // import "perkeep.org/pkg/importer/feed"
    19	
import (
	"bytes"
	"context"
	"fmt"
	"html/template"
	"io"
	"log"
	"net/http"
	"net/url"
	"strings"

	"perkeep.org/internal/httputil"
	"perkeep.org/pkg/blob"
	"perkeep.org/pkg/importer"
	"perkeep.org/pkg/schema"
	"perkeep.org/pkg/schema/nodeattr"

	"go4.org/ctxutil"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)
    40	
    41	const (
    42		// Permanode attributes on account node:
    43		acctAttrFeedURL = "feedURL"
    44	)
    45	
    46	func init() {
    47		importer.Register("feed", &imp{
    48			urlFileRef: make(map[string]blob.Ref),
    49		})
    50	}
    51	
// imp is the feed importer implementation; init registers one instance of it
// under the name "feed".
//
// NOTE(review): urlFileRef is allocated in init but is not read or written
// anywhere in the part of the file visible here — confirm whether it is still
// needed.
type imp struct {
	urlFileRef map[string]blob.Ref // url to file schema blob

	importer.OAuth1 // for CallbackRequestAccount and CallbackURLParameters
}
    57	
    58	func (*imp) Properties() importer.Properties {
    59		return importer.Properties{
    60			Title:               "Feed",
    61			Description:         "importer for RSS, Atom, and RDF feeds",
    62			SupportsIncremental: true,
    63			NeedsAPIKey:         false,
    64		}
    65	}
    66	
    67	func (im *imp) IsAccountReady(acctNode *importer.Object) (ok bool, err error) {
    68		if acctNode.Attr(acctAttrFeedURL) != "" {
    69			return true, nil
    70		}
    71		return false, nil
    72	}
    73	
    74	func (im *imp) SummarizeAccount(acct *importer.Object) string {
    75		ok, err := im.IsAccountReady(acct)
    76		if err != nil {
    77			return "Not configured; error = " + err.Error()
    78		}
    79		if !ok {
    80			return "Not configured"
    81		}
    82		return fmt.Sprintf("feed %s", acct.Attr(acctAttrFeedURL))
    83	}
    84	
// A run is our state for a given run of the importer.
type run struct {
	// RunContext is embedded so its methods (for example Context, RootNode
	// and AccountNode, which importFeed uses) can be called on the run.
	*importer.RunContext
	// im is the importer that started this run.
	im *imp
}
    90	
    91	func (im *imp) Run(ctx *importer.RunContext) error {
    92		r := &run{
    93			RunContext: ctx,
    94			im:         im,
    95		}
    96	
    97		if err := r.importFeed(); err != nil {
    98			return err
    99		}
   100		return nil
   101	}
   102	
   103	func (r *run) importFeed() error {
   104		accountNode := r.RunContext.AccountNode()
   105		feedURL, err := url.Parse(accountNode.Attr(acctAttrFeedURL))
   106		if err != nil {
   107			return err
   108		}
   109		body, err := doGet(r.Context(), feedURL.String())
   110		if err != nil {
   111			return err
   112		}
   113		if auto, err := autoDiscover(body); err == nil {
   114			if autoURL, err := url.Parse(auto); err == nil {
   115				if autoURL.Scheme == "" {
   116					autoURL.Scheme = feedURL.Scheme
   117				}
   118				if autoURL.Host == "" {
   119					autoURL.Host = feedURL.Host
   120				}
   121				body, err = doGet(r.Context(), autoURL.String())
   122				if err != nil {
   123					return err
   124				}
   125			}
   126		}
   127		feed, err := parseFeed(body, feedURL.String())
   128		if err != nil {
   129			return err
   130		}
   131		itemsNode := r.RootNode()
   132		if accountNode.Attr("title") == "" {
   133			accountNode.SetAttr("title", fmt.Sprintf("%s Feed", feed.Title))
   134		}
   135		if itemsNode.Attr("title") == "" {
   136			itemsNode.SetAttr("title", fmt.Sprintf("%s Items", feed.Title))
   137		}
   138		for _, item := range feed.Items {
   139			if err := r.importItem(itemsNode, item); err != nil {
   140				log.Printf("Feed importer: error importing item %s %v", item.ID, err)
   141				continue
   142			}
   143		}
   144		return nil
   145	}
   146	
   147	func (r *run) importItem(parent *importer.Object, item *item) error {
   148		itemNode, err := parent.ChildPathObject(item.ID)
   149		if err != nil {
   150			return err
   151		}
   152		fileRef, err := schema.WriteFileFromReader(r.Context(), r.Host.Target(), "", bytes.NewBufferString(item.Content))
   153		if err != nil {
   154			return err
   155		}
   156		if err := itemNode.SetAttrs(
   157			nodeattr.Type, "feed:item",
   158			nodeattr.Title, item.Title,
   159			nodeattr.CamliContent, fileRef.String(),
   160			"link", item.Link,
   161			"feedItemId", item.ID,
   162			"author", item.Author,
   163			"feedMediaContentURL", item.MediaContent,
   164		); err != nil {
   165			return err
   166		}
   167	
   168		if !item.Updated.IsZero() {
   169			if err := itemNode.SetAttr(nodeattr.DateModified, schema.RFC3339FromTime(item.Updated)); err != nil {
   170				return err
   171			}
   172		}
   173	
   174		if !item.Published.IsZero() {
   175			if err := itemNode.SetAttr(nodeattr.DatePublished, schema.RFC3339FromTime(item.Published)); err != nil {
   176				return err
   177			}
   178		}
   179	
   180		if !item.Created.IsZero() {
   181			if err := itemNode.SetAttr(nodeattr.DateCreated, schema.RFC3339FromTime(item.Created)); err != nil {
   182				return err
   183			}
   184		}
   185		return nil
   186	}
   187	
   188	// autodiscover takes an HTML document and returns the autodiscovered feed
   189	// URL. Returns an error if there is no such URL.
   190	func autoDiscover(body []byte) (feedURL string, err error) {
   191		r := bytes.NewReader(body)
   192		z := html.NewTokenizer(r)
   193		for {
   194			if z.Next() == html.ErrorToken {
   195				break
   196			}
   197			t := z.Token()
   198			switch t.DataAtom {
   199			case atom.Link:
   200				if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
   201					attrs := make(map[string]string)
   202					for _, a := range t.Attr {
   203						attrs[a.Key] = a.Val
   204					}
   205					if attrs["rel"] == "alternate" && attrs["href"] != "" &&
   206						(attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") {
   207						return attrs["href"], nil
   208					}
   209				}
   210			}
   211		}
   212		return "", fmt.Errorf("No feed link found")
   213	}
   214	
   215	func doGet(ctx context.Context, url string) ([]byte, error) {
   216		req, err := http.NewRequest("GET", url, nil)
   217		if err != nil {
   218			return nil, err
   219		}
   220		res, err := ctxutil.Client(ctx).Do(req)
   221		if err != nil {
   222			log.Printf("Error fetching %s: %v", url, err)
   223			return nil, err
   224		}
   225		defer res.Body.Close()
   226		if res.StatusCode != http.StatusOK {
   227			return nil, fmt.Errorf("Get request on %s failed with: %s", url, res.Status)
   228		}
   229		return io.ReadAll(io.LimitReader(res.Body, 8<<20))
   230	}
   231	
   232	func (im *imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
   233		return tmpl.ExecuteTemplate(w, "serveSetup", ctx)
   234	}
   235	
// tmpl holds the HTML templates used by this importer. template.Must panics
// at package initialization if the template text fails to parse.
//
// The "serveSetup" template renders a GET form whose action is the data's
// CallbackURL. It submits the account permanode ref as the hidden "acct"
// field and the user-entered address as "feedURL".
var tmpl = template.Must(template.New("root").Parse(`
{{define "serveSetup"}}
<h1>Configuring Feed</h1>
<form method="get" action="{{.CallbackURL}}">
  <input type="hidden" name="acct" value="{{.AccountNode.PermanodeRef}}">
  <table border=0 cellpadding=3>
  <tr><td align=right>Feed URL</td><td><input name="feedURL" size=50></td></tr>
  <tr><td align=right></td><td><input type="submit" value="Add"></td></tr>
  </table>
</form>
{{end}}
`))
   248	
   249	func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
   250		u := r.FormValue("feedURL")
   251		if u == "" {
   252			http.Error(w, "Expected a feed URL", http.StatusBadRequest)
   253			return
   254		}
   255		feed, err := url.Parse(u)
   256		if err != nil {
   257			httputil.ServeError(w, r, err)
   258			return
   259		}
   260		if feed.Scheme == "" {
   261			feed.Scheme = "http"
   262		}
   263		if err := ctx.AccountNode.SetAttrs(
   264			acctAttrFeedURL, feed.String(),
   265		); err != nil {
   266			httputil.ServeError(w, r, fmt.Errorf("Error setting attribute: %v", err))
   267			return
   268		}
   269		http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
   270	}
Website layout inspired by memcached.
Content by the authors.