1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16
17 package feed
18
19 import (
20 "bytes"
21 "encoding/xml"
22 "fmt"
23 "html"
24 "log"
25 "net/url"
26 "strings"
27 "time"
28
29 "golang.org/x/net/html/charset"
30 "perkeep.org/pkg/importer/feed/atom"
31 "perkeep.org/pkg/importer/feed/rdf"
32 "perkeep.org/pkg/importer/feed/rss"
33 )
34
35 type feed struct {
36 Title string
37 Updated time.Time
38 Link string
39 Items []*item
40 }
41
// item is a single feed entry in format-independent form.
type item struct {
	// ID is the entry identifier supplied by the document, Title its
	// (entity-unescaped) title and Link the entry's link.
	ID, Title, Link string

	// Published and Updated are set only when the document carries a date
	// that parseDate understands. None of the parsers in this file assigns
	// Created.
	Created, Published, Updated time.Time

	// Author is the entry's author, Content its body text, and MediaContent
	// the URL of an attached audio resource (only parseRSS assigns it).
	Author, Content, MediaContent string
}
53
54 func parseFeed(body []byte, feedURL string) (*feed, error) {
55 var f *feed
56 var atomerr, rsserr, rdferr error
57 f, atomerr = parseAtom(body)
58 if f == nil {
59 f, rsserr = parseRSS(body)
60 }
61 if f == nil {
62 f, rdferr = parseRDF(body)
63 }
64 if f == nil {
65 log.Printf("atom parse error: %s", atomerr.Error())
66 log.Printf("xml parse error: %s", rsserr.Error())
67 log.Printf("rdf parse error: %s", rdferr.Error())
68 return nil, fmt.Errorf("Could not parse feed data")
69 }
70 return f, nil
71 }
72
73 func parseAtom(body []byte) (*feed, error) {
74 var f feed
75 var a atom.Feed
76 d := xml.NewDecoder(bytes.NewReader(body))
77 d.CharsetReader = charset.NewReaderLabel
78 if err := d.Decode(&a); err != nil {
79 return nil, err
80 }
81 f.Title = a.Title
82 if t, err := parseDate(string(a.Updated)); err == nil {
83 f.Updated = t
84 }
85 fb, err := url.Parse(a.XMLBase)
86 if err != nil {
87 fb, _ = url.Parse("")
88 }
89 if len(a.Link) > 0 {
90 f.Link = findBestAtomLink(a.Link)
91 if l, err := fb.Parse(f.Link); err == nil {
92 f.Link = l.String()
93 }
94 }
95
96 for _, i := range a.Entry {
97 eb, err := fb.Parse(i.XMLBase)
98 if err != nil {
99 eb = fb
100 }
101 st := item{
102 ID: i.ID,
103 Title: atomTitle(i.Title),
104 }
105 if t, err := parseDate(string(i.Updated)); err == nil {
106 st.Updated = t
107 }
108 if t, err := parseDate(string(i.Published)); err == nil {
109 st.Published = t
110 }
111 if len(i.Link) > 0 {
112 st.Link = findBestAtomLink(i.Link)
113 if l, err := eb.Parse(st.Link); err == nil {
114 st.Link = l.String()
115 }
116 }
117 if i.Author != nil {
118 st.Author = i.Author.Name
119 }
120 if i.Content != nil {
121 if len(strings.TrimSpace(i.Content.Body)) != 0 {
122 st.Content = i.Content.Body
123 } else if len(i.Content.InnerXML) != 0 {
124 st.Content = i.Content.InnerXML
125 }
126 } else if i.Summary != nil {
127 st.Content = i.Summary.Body
128 }
129 f.Items = append(f.Items, &st)
130 }
131 return &f, nil
132 }
133
134 func parseRSS(body []byte) (*feed, error) {
135 var f feed
136 var r rss.RSS
137 d := xml.NewDecoder(bytes.NewReader(body))
138 d.CharsetReader = charset.NewReaderLabel
139 d.DefaultSpace = "DefaultSpace"
140 if err := d.Decode(&r); err != nil {
141 return nil, err
142 }
143 f.Title = r.Title
144 if t, err := parseDate(r.LastBuildDate, r.PubDate); err == nil {
145 f.Updated = t
146 }
147 f.Link = r.BaseLink()
148
149 for _, i := range r.Items {
150 st := item{
151 Link: i.Link,
152 Author: i.Author,
153 }
154 if i.Content != "" {
155 st.Content = i.Content
156 } else if i.Description != "" {
157 st.Content = i.Description
158 }
159 if i.Title != "" {
160 st.Title = i.Title
161 } else if i.Description != "" {
162 st.Title = i.Description
163 }
164 if st.Content == st.Title {
165 st.Title = ""
166 }
167 st.Title = textTitle(st.Title)
168 if i.Guid != nil {
169 st.ID = i.Guid.Guid
170 }
171 if i.Enclosure != nil && strings.HasPrefix(i.Enclosure.Type, "audio/") {
172 st.MediaContent = i.Enclosure.Url
173 } else if i.Media != nil && strings.HasPrefix(i.Media.Type, "audio/") {
174 st.MediaContent = i.Media.URL
175 }
176 if t, err := parseDate(i.PubDate, i.Date, i.Published); err == nil {
177 st.Published = t
178 st.Updated = t
179 }
180 f.Items = append(f.Items, &st)
181 }
182
183 return &f, nil
184 }
185
186 func parseRDF(body []byte) (*feed, error) {
187 var f feed
188 var rd rdf.RDF
189 d := xml.NewDecoder(bytes.NewReader(body))
190 d.CharsetReader = charset.NewReaderLabel
191 if err := d.Decode(&rd); err != nil {
192 return nil, err
193 }
194 if rd.Channel != nil {
195 f.Title = rd.Channel.Title
196 f.Link = rd.Channel.Link
197 if t, err := parseDate(rd.Channel.Date); err == nil {
198 f.Updated = t
199 }
200 }
201
202 for _, i := range rd.Item {
203 st := item{
204 ID: i.About,
205 Title: textTitle(i.Title),
206 Link: i.Link,
207 Author: i.Creator,
208 }
209 if len(i.Description) > 0 {
210 st.Content = html.UnescapeString(i.Description)
211 } else if len(i.Content) > 0 {
212 st.Content = html.UnescapeString(i.Content)
213 }
214 if t, err := parseDate(i.Date); err == nil {
215 st.Published = t
216 st.Updated = t
217 }
218 f.Items = append(f.Items, &st)
219 }
220
221 return &f, nil
222 }
223
// textTitle returns t with HTML entities (such as "&amp;") replaced by the
// characters they stand for. Markup tags, if any, are left in place.
func textTitle(t string) string {
	unescaped := html.UnescapeString(t)
	return unescaped
}
227
228 func atomTitle(t *atom.Text) string {
229 if t == nil {
230 return ""
231 }
232 if t.Type == "html" {
233
234
235 }
236 return textTitle(t.Body)
237 }
238
239 func findBestAtomLink(links []atom.Link) string {
240 getScore := func(l atom.Link) int {
241 switch {
242 case l.Rel == "hub":
243 return 0
244 case l.Rel == "alternate" && l.Type == "text/html":
245 return 5
246 case l.Type == "text/html":
247 return 4
248 case l.Rel == "self":
249 return 2
250 case l.Rel == "":
251 return 3
252 default:
253 return 1
254 }
255 }
256
257 var bestlink string
258 bestscore := -1
259 for _, l := range links {
260 score := getScore(l)
261 if score > bestscore {
262 bestlink = l.Href
263 bestscore = score
264 }
265 }
266
267 return bestlink
268 }
269
// dateFormats lists the time layouts parseDate tries, in order, when
// decoding a date string found in a feed. Besides the standard layouts at
// the end, it covers many non-standard spellings.
//
// Order is significant: parseDate returns the result of the first layout
// that parses, and some inputs are ambiguous (for example month/day versus
// day/month), so do not sort or reorder entries casually.
//
// Every layout must be written with Go's reference time
// (Mon Jan 2 15:04:05 MST 2006); in particular the 24-hour field is "15".
var dateFormats = []string{
	"01-02-2006",
	"01/02/2006",
	"01/02/2006 - 15:04",
	"01/02/2006 15:04:05 MST",
	"01/02/2006 3:04 PM",
	"02-01-2006",
	"02/01/2006",
	"02.01.2006 -0700",
	"02/01/2006 - 15:04",
	"02.01.2006 15:04",
	"02/01/2006 15:04:05",
	"02.01.2006 15:04:05",
	"02-01-2006 15:04:05 MST",
	"02/01/2006 15:04 MST",
	"02 Jan 2006",
	"02 Jan 2006 15:04:05",
	"02 Jan 2006 15:04:05 -0700",
	"02 Jan 2006 15:04:05 MST",
	"02 Jan 2006 15:04:05 UT",
	"02 Jan 2006 15:04 MST",
	"02 Monday, Jan 2006 15:04",
	"06-1-2 15:04",
	"06/1/2 15:04",
	"1/2/2006",
	"1/2/2006 15:04:05 MST",
	"1/2/2006 3:04:05 PM",
	"1/2/2006 3:04:05 PM MST",
	"15:04 02.01.2006 -0700",
	"2006-01-02",
	"2006/01/02",
	"2006-01-02 00:00:00.0 15:04:05.0 -0700",
	"2006-01-02 15:04",
	"2006-01-02 15:04:05 -0700",
	"2006-01-02 15:04:05-07:00",
	"2006-01-02 15:04:05-0700",
	"2006-01-02 15:04:05 MST",
	"2006-01-02 15:04:05Z",
	"2006-01-02 at 15:04:05",
	"2006-01-02T15:04:05",
	"2006-01-02T15:04:05:00",
	"2006-01-02T15:04:05 -0700",
	"2006-01-02T15:04:05-07:00",
	"2006-01-02T15:04:05-0700",
	"2006-01-02T15:04:05:-0700",
	"2006-01-02T15:04:05-07:00:00",
	"2006-01-02T15:04:05Z",
	"2006-01-02T15:04-07:00",
	"2006-01-02T15:04Z",
	"2006-1-02T15:04:05Z",
	"2006-1-2",
	"2006-1-2 15:04:05",
	"2006-1-2T15:04:05Z",
	"2006 January 02",
	"2-1-2006",
	"2/1/2006",
	"2.1.2006 15:04:05",
	"2 Jan 2006",
	"2 Jan 2006 15:04:05 -0700",
	"2 Jan 2006 15:04:05 MST",
	"2 Jan 2006 15:04:05 Z",
	"2 January 2006",
	"2 January 2006 15:04:05 -0700",
	"2 January 2006 15:04:05 MST",
	"6-1-2 15:04",
	"6/1/2 15:04",
	"Jan 02, 2006",
	"Jan 02 2006 03:04:05PM",
	"Jan 2, 2006",
	"Jan 2, 2006 15:04:05 MST",
	"Jan 2, 2006 3:04:05 PM",
	"Jan 2, 2006 3:04:05 PM MST",
	"January 02, 2006",
	"January 02, 2006 03:04 PM",
	"January 02, 2006 15:04",
	"January 02, 2006 15:04:05 MST",
	"January 2, 2006",
	"January 2, 2006 03:04 PM",
	"January 2, 2006 15:04:05",
	"January 2, 2006 15:04:05 MST",
	"January 2, 2006, 3:04 p.m.",
	"January 2, 2006 3:04 PM",
	"Mon, 02 Jan 06 15:04:05 MST",
	"Mon, 02 Jan 2006",
	"Mon, 02 Jan 2006 15:04:05",
	"Mon, 02 Jan 2006 15:04:05 00",
	"Mon, 02 Jan 2006 15:04:05 -07",
	"Mon 02 Jan 2006 15:04:05 -0700",
	"Mon, 02 Jan 2006 15:04:05 --0700",
	"Mon, 02 Jan 2006 15:04:05 -07:00",
	"Mon, 02 Jan 2006 15:04:05 -0700",
	"Mon,02 Jan 2006 15:04:05 -0700",
	"Mon, 02 Jan 2006 15:04:05 GMT-0700",
	"Mon , 02 Jan 2006 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05MST",
	"Mon, 02 Jan 2006, 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05 MST -0700",
	"Mon, 02 Jan 2006 15:04:05 MST-07:00",
	"Mon, 02 Jan 2006 15:04:05 UT",
	"Mon, 02 Jan 2006 15:04:05 Z",
	"Mon, 02 Jan 2006 15:04 -0700",
	"Mon, 02 Jan 2006 15:04 MST",
	"Mon,02 Jan 2006 15:04 MST",
	"Mon, 02 Jan 2006 15 -0700",
	"Mon, 02 Jan 2006 3:04:05 PM MST",
	"Mon, 02 January 2006",
	// The hour must be spelled "15": "14" would be read as the month
	// number followed by a literal "4" and never match a real timestamp.
	"Mon,02 January 2006 15:04:05 MST",
	"Mon, 2006-01-02 15:04",
	"Mon, 2 Jan 06 15:04:05 -0700",
	"Mon, 2 Jan 06 15:04:05 MST",
	"Mon, 2 Jan 15:04:05 MST",
	"Mon, 2 Jan 2006",
	"Mon,2 Jan 2006",
	"Mon, 2 Jan 2006 15:04",
	"Mon, 2 Jan 2006 15:04:05",
	"Mon, 2 Jan 2006 15:04:05 -0700",
	"Mon, 2 Jan 2006 15:04:05-0700",
	"Mon, 2 Jan 2006 15:04:05 -0700 MST",
	"mon,2 Jan 2006 15:04:05 MST",
	"Mon 2 Jan 2006 15:04:05 MST",
	"Mon, 2 Jan 2006 15:04:05 MST",
	"Mon, 2 Jan 2006 15:04:05MST",
	"Mon, 2 Jan 2006 15:04:05 UT",
	"Mon, 2 Jan 2006 15:04 -0700",
	"Mon, 2 Jan 2006, 15:04 -0700",
	"Mon, 2 Jan 2006 15:04 MST",
	"Mon, 2, Jan 2006 15:4",
	"Mon, 2 Jan 2006 15:4:5 -0700 GMT",
	"Mon, 2 Jan 2006 15:4:5 MST",
	"Mon, 2 Jan 2006 3:04:05 PM -0700",
	"Mon, 2 January 2006",
	"Mon, 2 January 2006 15:04:05 -0700",
	"Mon, 2 January 2006 15:04:05 MST",
	"Mon, 2 January 2006, 15:04:05 MST",
	"Mon, 2 January 2006, 15:04 -0700",
	"Mon, 2 January 2006 15:04 MST",
	"Monday, 02 January 2006 15:04:05",
	"Monday, 02 January 2006 15:04:05 -0700",
	"Monday, 02 January 2006 15:04:05 MST",
	"Monday, 2 Jan 2006 15:04:05 -0700",
	"Monday, 2 Jan 2006 15:04:05 MST",
	"Monday, 2 January 2006 15:04:05 -0700",
	"Monday, 2 January 2006 15:04:05 MST",
	"Monday, January 02, 2006",
	"Monday, January 2, 2006",
	"Monday, January 2, 2006 03:04 PM",
	"Monday, January 2, 2006 15:04:05 MST",
	"Mon Jan 02 2006 15:04:05 -0700",
	"Mon, Jan 02,2006 15:04:05 MST",
	"Mon Jan 02, 2006 3:04 pm",
	"Mon Jan 2 15:04:05 2006 MST",
	"Mon Jan 2 15:04 2006",
	"Mon, Jan 2 2006 15:04:05 -0700",
	"Mon, Jan 2 2006 15:04:05 -700",
	"Mon, Jan 2, 2006 15:04:05 MST",
	"Mon, Jan 2 2006 15:04 MST",
	"Mon, Jan 2, 2006 15:04 MST",
	"Mon, January 02, 2006 15:04:05 MST",
	"Mon, January 02, 2006, 15:04:05 MST",
	"Mon, January 2 2006 15:04:05 -0700",
	"Updated January 2, 2006",
	time.ANSIC,
	time.RFC1123,
	time.RFC1123Z,
	time.RFC3339,
	time.RFC822,
	time.RFC822Z,
	time.RFC850,
	time.RubyDate,
	time.UnixDate,
}
442
443 func parseDate(ds ...string) (t time.Time, err error) {
444 for _, d := range ds {
445 d = strings.TrimSpace(d)
446 if d == "" {
447 continue
448 }
449 for _, f := range dateFormats {
450 if t, err = time.Parse(f, d); err == nil {
451 return
452 }
453 }
454 }
455 err = fmt.Errorf("could not parse dates: %v", strings.Join(ds, ", "))
456 return
457 }