1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16
17
18 package feed
19
20 import (
21 "bytes"
22 "context"
23 "fmt"
24 "html/template"
25 "io"
26 "log"
27 "net/http"
28 "net/url"
29
30 "perkeep.org/internal/httputil"
31 "perkeep.org/pkg/blob"
32 "perkeep.org/pkg/importer"
33 "perkeep.org/pkg/schema"
34 "perkeep.org/pkg/schema/nodeattr"
35
36 "go4.org/ctxutil"
37 "golang.org/x/net/html"
38 "golang.org/x/net/html/atom"
39 )
40
// acctAttrFeedURL is the name of the account node attribute that holds the
// URL of the feed to import. ServeCallback writes it; importFeed and
// IsAccountReady read it.
const acctAttrFeedURL = "feedURL"
45
46 func init() {
47 importer.Register("feed", &imp{
48 urlFileRef: make(map[string]blob.Ref),
49 })
50 }
51
52 type imp struct {
53 urlFileRef map[string]blob.Ref
54
55 importer.OAuth1
56 }
57
58 func (*imp) Properties() importer.Properties {
59 return importer.Properties{
60 Title: "Feed",
61 Description: "importer for RSS, Atom, and RDF feeds",
62 SupportsIncremental: true,
63 NeedsAPIKey: false,
64 }
65 }
66
67 func (im *imp) IsAccountReady(acctNode *importer.Object) (ok bool, err error) {
68 if acctNode.Attr(acctAttrFeedURL) != "" {
69 return true, nil
70 }
71 return false, nil
72 }
73
74 func (im *imp) SummarizeAccount(acct *importer.Object) string {
75 ok, err := im.IsAccountReady(acct)
76 if err != nil {
77 return "Not configured; error = " + err.Error()
78 }
79 if !ok {
80 return "Not configured"
81 }
82 return fmt.Sprintf("feed %s", acct.Attr(acctAttrFeedURL))
83 }
84
85
// run is the state of a single import run: the importer.RunContext handed to
// imp.Run, plus the imp that started the run.
type run struct {
	*importer.RunContext
	im *imp // importer that created this run
}
90
91 func (im *imp) Run(ctx *importer.RunContext) error {
92 r := &run{
93 RunContext: ctx,
94 im: im,
95 }
96
97 if err := r.importFeed(); err != nil {
98 return err
99 }
100 return nil
101 }
102
103 func (r *run) importFeed() error {
104 accountNode := r.RunContext.AccountNode()
105 feedURL, err := url.Parse(accountNode.Attr(acctAttrFeedURL))
106 if err != nil {
107 return err
108 }
109 body, err := doGet(r.Context(), feedURL.String())
110 if err != nil {
111 return err
112 }
113 if auto, err := autoDiscover(body); err == nil {
114 if autoURL, err := url.Parse(auto); err == nil {
115 if autoURL.Scheme == "" {
116 autoURL.Scheme = feedURL.Scheme
117 }
118 if autoURL.Host == "" {
119 autoURL.Host = feedURL.Host
120 }
121 body, err = doGet(r.Context(), autoURL.String())
122 if err != nil {
123 return err
124 }
125 }
126 }
127 feed, err := parseFeed(body, feedURL.String())
128 if err != nil {
129 return err
130 }
131 itemsNode := r.RootNode()
132 if accountNode.Attr("title") == "" {
133 accountNode.SetAttr("title", fmt.Sprintf("%s Feed", feed.Title))
134 }
135 if itemsNode.Attr("title") == "" {
136 itemsNode.SetAttr("title", fmt.Sprintf("%s Items", feed.Title))
137 }
138 for _, item := range feed.Items {
139 if err := r.importItem(itemsNode, item); err != nil {
140 log.Printf("Feed importer: error importing item %s %v", item.ID, err)
141 continue
142 }
143 }
144 return nil
145 }
146
147 func (r *run) importItem(parent *importer.Object, item *item) error {
148 itemNode, err := parent.ChildPathObject(item.ID)
149 if err != nil {
150 return err
151 }
152 fileRef, err := schema.WriteFileFromReader(r.Context(), r.Host.Target(), "", bytes.NewBufferString(item.Content))
153 if err != nil {
154 return err
155 }
156 if err := itemNode.SetAttrs(
157 nodeattr.Type, "feed:item",
158 nodeattr.Title, item.Title,
159 nodeattr.CamliContent, fileRef.String(),
160 "link", item.Link,
161 "feedItemId", item.ID,
162 "author", item.Author,
163 "feedMediaContentURL", item.MediaContent,
164 ); err != nil {
165 return err
166 }
167
168 if !item.Updated.IsZero() {
169 if err := itemNode.SetAttr(nodeattr.DateModified, schema.RFC3339FromTime(item.Updated)); err != nil {
170 return err
171 }
172 }
173
174 if !item.Published.IsZero() {
175 if err := itemNode.SetAttr(nodeattr.DatePublished, schema.RFC3339FromTime(item.Published)); err != nil {
176 return err
177 }
178 }
179
180 if !item.Created.IsZero() {
181 if err := itemNode.SetAttr(nodeattr.DateCreated, schema.RFC3339FromTime(item.Created)); err != nil {
182 return err
183 }
184 }
185 return nil
186 }
187
188
189
190 func autoDiscover(body []byte) (feedURL string, err error) {
191 r := bytes.NewReader(body)
192 z := html.NewTokenizer(r)
193 for {
194 if z.Next() == html.ErrorToken {
195 break
196 }
197 t := z.Token()
198 switch t.DataAtom {
199 case atom.Link:
200 if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
201 attrs := make(map[string]string)
202 for _, a := range t.Attr {
203 attrs[a.Key] = a.Val
204 }
205 if attrs["rel"] == "alternate" && attrs["href"] != "" &&
206 (attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") {
207 return attrs["href"], nil
208 }
209 }
210 }
211 }
212 return "", fmt.Errorf("No feed link found")
213 }
214
215 func doGet(ctx context.Context, url string) ([]byte, error) {
216 req, err := http.NewRequest("GET", url, nil)
217 if err != nil {
218 return nil, err
219 }
220 res, err := ctxutil.Client(ctx).Do(req)
221 if err != nil {
222 log.Printf("Error fetching %s: %v", url, err)
223 return nil, err
224 }
225 defer res.Body.Close()
226 if res.StatusCode != http.StatusOK {
227 return nil, fmt.Errorf("Get request on %s failed with: %s", url, res.Status)
228 }
229 return io.ReadAll(io.LimitReader(res.Body, 8<<20))
230 }
231
232 func (im *imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
233 return tmpl.ExecuteTemplate(w, "serveSetup", ctx)
234 }
235
// tmpl holds the importer's HTML templates. The "serveSetup" template renders
// a form that submits, via GET to the data's CallbackURL, the account
// permanode ref (as "acct") and the user-entered feed URL (as "feedURL").
var tmpl = template.Must(template.New("root").Parse(`
{{define "serveSetup"}}
<h1>Configuring Feed</h1>
<form method="get" action="{{.CallbackURL}}">
<input type="hidden" name="acct" value="{{.AccountNode.PermanodeRef}}">
<table border=0 cellpadding=3>
<tr><td align=right>Feed URL</td><td><input name="feedURL" size=50></td></tr>
<tr><td align=right></td><td><input type="submit" value="Add"></td></tr>
</table>
</form>
{{end}}
`))
248
249 func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
250 u := r.FormValue("feedURL")
251 if u == "" {
252 http.Error(w, "Expected a feed URL", http.StatusBadRequest)
253 return
254 }
255 feed, err := url.Parse(u)
256 if err != nil {
257 httputil.ServeError(w, r, err)
258 return
259 }
260 if feed.Scheme == "" {
261 feed.Scheme = "http"
262 }
263 if err := ctx.AccountNode.SetAttrs(
264 acctAttrFeedURL, feed.String(),
265 ); err != nil {
266 httputil.ServeError(w, r, fmt.Errorf("Error setting attribute: %v", err))
267 return
268 }
269 http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
270 }