Home Download Docs Code Community
     1	/*
     2	Copyright 2014 The Perkeep Authors
     3	
     4	Licensed under the Apache License, Version 2.0 (the "License");
     5	you may not use this file except in compliance with the License.
     6	You may obtain a copy of the License at
     7	
     8	     http://www.apache.org/licenses/LICENSE-2.0
     9	
    10	Unless required by applicable law or agreed to in writing, software
    11	distributed under the License is distributed on an "AS IS" BASIS,
    12	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13	See the License for the specific language governing permissions and
    14	limitations under the License.
    15	*/
    16	
    17	package feed
    18	
    19	import (
    20		"bytes"
    21		"encoding/xml"
    22		"fmt"
    23		"html"
    24		"log"
    25		"net/url"
    26		"strings"
    27		"time"
    28	
    29		"golang.org/x/net/html/charset"
    30		"perkeep.org/pkg/importer/feed/atom"
    31		"perkeep.org/pkg/importer/feed/rdf"
    32		"perkeep.org/pkg/importer/feed/rss"
    33	)
    34	
// feed is the format-independent result of parsing a feed document. It is
// filled in by parseAtom, parseRSS, or parseRDF.
type feed struct {
	// Title is the feed-level title taken from the source document.
	Title   string
	// Updated is the feed-level timestamp; it stays the zero time when no
	// date in the document could be parsed.
	Updated time.Time
	// Link is the feed-level link selected by the parser.
	Link    string
	// Items holds one entry per item decoded from the document, in the order
	// the parser appended them.
	Items   []*item
}
    41	
// item is the format-independent representation of a single feed entry.
type item struct {
	// ID is the entry identifier: the Atom entry ID, the RSS guid, or the
	// RDF "about" value, depending on which parser produced the item.
	ID           string
	// Title is the entry title with HTML entities unescaped.
	Title        string
	// Link is the link chosen for the entry.
	Link         string
	// Created is not assigned by any of the parsers in this file.
	Created      time.Time
	// Published and Updated stay the zero time when the corresponding date
	// could not be parsed. The RSS and RDF parsers set both to the same value.
	Published    time.Time
	Updated      time.Time
	// Author is the author or creator name, when the entry has one.
	Author       string
	// Content is the entry body (content, description, or summary).
	Content      string
	// MediaContent is set by the RSS parser to the URL of an enclosure or
	// media element whose type starts with "audio/".
	MediaContent string
}
    53	
    54	func parseFeed(body []byte, feedURL string) (*feed, error) {
    55		var f *feed
    56		var atomerr, rsserr, rdferr error
    57		f, atomerr = parseAtom(body)
    58		if f == nil {
    59			f, rsserr = parseRSS(body)
    60		}
    61		if f == nil {
    62			f, rdferr = parseRDF(body)
    63		}
    64		if f == nil {
    65			log.Printf("atom parse error: %s", atomerr.Error())
    66			log.Printf("xml parse error: %s", rsserr.Error())
    67			log.Printf("rdf parse error: %s", rdferr.Error())
    68			return nil, fmt.Errorf("Could not parse feed data")
    69		}
    70		return f, nil
    71	}
    72	
    73	func parseAtom(body []byte) (*feed, error) {
    74		var f feed
    75		var a atom.Feed
    76		d := xml.NewDecoder(bytes.NewReader(body))
    77		d.CharsetReader = charset.NewReaderLabel
    78		if err := d.Decode(&a); err != nil {
    79			return nil, err
    80		}
    81		f.Title = a.Title
    82		if t, err := parseDate(string(a.Updated)); err == nil {
    83			f.Updated = t
    84		}
    85		fb, err := url.Parse(a.XMLBase)
    86		if err != nil {
    87			fb, _ = url.Parse("")
    88		}
    89		if len(a.Link) > 0 {
    90			f.Link = findBestAtomLink(a.Link)
    91			if l, err := fb.Parse(f.Link); err == nil {
    92				f.Link = l.String()
    93			}
    94		}
    95	
    96		for _, i := range a.Entry {
    97			eb, err := fb.Parse(i.XMLBase)
    98			if err != nil {
    99				eb = fb
   100			}
   101			st := item{
   102				ID:    i.ID,
   103				Title: atomTitle(i.Title),
   104			}
   105			if t, err := parseDate(string(i.Updated)); err == nil {
   106				st.Updated = t
   107			}
   108			if t, err := parseDate(string(i.Published)); err == nil {
   109				st.Published = t
   110			}
   111			if len(i.Link) > 0 {
   112				st.Link = findBestAtomLink(i.Link)
   113				if l, err := eb.Parse(st.Link); err == nil {
   114					st.Link = l.String()
   115				}
   116			}
   117			if i.Author != nil {
   118				st.Author = i.Author.Name
   119			}
   120			if i.Content != nil {
   121				if len(strings.TrimSpace(i.Content.Body)) != 0 {
   122					st.Content = i.Content.Body
   123				} else if len(i.Content.InnerXML) != 0 {
   124					st.Content = i.Content.InnerXML
   125				}
   126			} else if i.Summary != nil {
   127				st.Content = i.Summary.Body
   128			}
   129			f.Items = append(f.Items, &st)
   130		}
   131		return &f, nil
   132	}
   133	
   134	func parseRSS(body []byte) (*feed, error) {
   135		var f feed
   136		var r rss.RSS
   137		d := xml.NewDecoder(bytes.NewReader(body))
   138		d.CharsetReader = charset.NewReaderLabel
   139		d.DefaultSpace = "DefaultSpace"
   140		if err := d.Decode(&r); err != nil {
   141			return nil, err
   142		}
   143		f.Title = r.Title
   144		if t, err := parseDate(r.LastBuildDate, r.PubDate); err == nil {
   145			f.Updated = t
   146		}
   147		f.Link = r.BaseLink()
   148	
   149		for _, i := range r.Items {
   150			st := item{
   151				Link:   i.Link,
   152				Author: i.Author,
   153			}
   154			if i.Content != "" {
   155				st.Content = i.Content
   156			} else if i.Description != "" {
   157				st.Content = i.Description
   158			}
   159			if i.Title != "" {
   160				st.Title = i.Title
   161			} else if i.Description != "" {
   162				st.Title = i.Description
   163			}
   164			if st.Content == st.Title {
   165				st.Title = ""
   166			}
   167			st.Title = textTitle(st.Title)
   168			if i.Guid != nil {
   169				st.ID = i.Guid.Guid
   170			}
   171			if i.Enclosure != nil && strings.HasPrefix(i.Enclosure.Type, "audio/") {
   172				st.MediaContent = i.Enclosure.Url
   173			} else if i.Media != nil && strings.HasPrefix(i.Media.Type, "audio/") {
   174				st.MediaContent = i.Media.URL
   175			}
   176			if t, err := parseDate(i.PubDate, i.Date, i.Published); err == nil {
   177				st.Published = t
   178				st.Updated = t
   179			}
   180			f.Items = append(f.Items, &st)
   181		}
   182	
   183		return &f, nil
   184	}
   185	
   186	func parseRDF(body []byte) (*feed, error) {
   187		var f feed
   188		var rd rdf.RDF
   189		d := xml.NewDecoder(bytes.NewReader(body))
   190		d.CharsetReader = charset.NewReaderLabel
   191		if err := d.Decode(&rd); err != nil {
   192			return nil, err
   193		}
   194		if rd.Channel != nil {
   195			f.Title = rd.Channel.Title
   196			f.Link = rd.Channel.Link
   197			if t, err := parseDate(rd.Channel.Date); err == nil {
   198				f.Updated = t
   199			}
   200		}
   201	
   202		for _, i := range rd.Item {
   203			st := item{
   204				ID:     i.About,
   205				Title:  textTitle(i.Title),
   206				Link:   i.Link,
   207				Author: i.Creator,
   208			}
   209			if len(i.Description) > 0 {
   210				st.Content = html.UnescapeString(i.Description)
   211			} else if len(i.Content) > 0 {
   212				st.Content = html.UnescapeString(i.Content)
   213			}
   214			if t, err := parseDate(i.Date); err == nil {
   215				st.Published = t
   216				st.Updated = t
   217			}
   218			f.Items = append(f.Items, &st)
   219		}
   220	
   221		return &f, nil
   222	}
   223	
   224	func textTitle(t string) string {
   225		return html.UnescapeString(t)
   226	}
   227	
   228	func atomTitle(t *atom.Text) string {
   229		if t == nil {
   230			return ""
   231		}
   232		if t.Type == "html" {
   233			// see: https://github.com/mjibson/goread/blob/59aec794f3ef87b36c1bac029438c33a6aa6d8d3/utils.go#L533
   234			//return html.UnescapeString(sanitizer.StripTags(t.Body))
   235		}
   236		return textTitle(t.Body)
   237	}
   238	
   239	func findBestAtomLink(links []atom.Link) string {
   240		getScore := func(l atom.Link) int {
   241			switch {
   242			case l.Rel == "hub":
   243				return 0
   244			case l.Rel == "alternate" && l.Type == "text/html":
   245				return 5
   246			case l.Type == "text/html":
   247				return 4
   248			case l.Rel == "self":
   249				return 2
   250			case l.Rel == "":
   251				return 3
   252			default:
   253				return 1
   254			}
   255		}
   256	
   257		var bestlink string
   258		bestscore := -1
   259		for _, l := range links {
   260			score := getScore(l)
   261			if score > bestscore {
   262				bestlink = l.Href
   263				bestscore = score
   264			}
   265		}
   266	
   267		return bestlink
   268	}
   269	
   270	var dateFormats = []string{
   271		"01-02-2006",
   272		"01/02/2006",
   273		"01/02/2006 - 15:04",
   274		"01/02/2006 15:04:05 MST",
   275		"01/02/2006 3:04 PM",
   276		"02-01-2006",
   277		"02/01/2006",
   278		"02.01.2006 -0700",
   279		"02/01/2006 - 15:04",
   280		"02.01.2006 15:04",
   281		"02/01/2006 15:04:05",
   282		"02.01.2006 15:04:05",
   283		"02-01-2006 15:04:05 MST",
   284		"02/01/2006 15:04 MST",
   285		"02 Jan 2006",
   286		"02 Jan 2006 15:04:05",
   287		"02 Jan 2006 15:04:05 -0700",
   288		"02 Jan 2006 15:04:05 MST",
   289		"02 Jan 2006 15:04:05 UT",
   290		"02 Jan 2006 15:04 MST",
   291		"02 Monday, Jan 2006 15:04",
   292		"06-1-2 15:04",
   293		"06/1/2 15:04",
   294		"1/2/2006",
   295		"1/2/2006 15:04:05 MST",
   296		"1/2/2006 3:04:05 PM",
   297		"1/2/2006 3:04:05 PM MST",
   298		"15:04 02.01.2006 -0700",
   299		"2006-01-02",
   300		"2006/01/02",
   301		"2006-01-02 00:00:00.0 15:04:05.0 -0700",
   302		"2006-01-02 15:04",
   303		"2006-01-02 15:04:05 -0700",
   304		"2006-01-02 15:04:05-07:00",
   305		"2006-01-02 15:04:05-0700",
   306		"2006-01-02 15:04:05 MST",
   307		"2006-01-02 15:04:05Z",
   308		"2006-01-02 at 15:04:05",
   309		"2006-01-02T15:04:05",
   310		"2006-01-02T15:04:05:00",
   311		"2006-01-02T15:04:05 -0700",
   312		"2006-01-02T15:04:05-07:00",
   313		"2006-01-02T15:04:05-0700",
   314		"2006-01-02T15:04:05:-0700",
   315		"2006-01-02T15:04:05-07:00:00",
   316		"2006-01-02T15:04:05Z",
   317		"2006-01-02T15:04-07:00",
   318		"2006-01-02T15:04Z",
   319		"2006-1-02T15:04:05Z",
   320		"2006-1-2",
   321		"2006-1-2 15:04:05",
   322		"2006-1-2T15:04:05Z",
   323		"2006 January 02",
   324		"2-1-2006",
   325		"2/1/2006",
   326		"2.1.2006 15:04:05",
   327		"2 Jan 2006",
   328		"2 Jan 2006 15:04:05 -0700",
   329		"2 Jan 2006 15:04:05 MST",
   330		"2 Jan 2006 15:04:05 Z",
   331		"2 January 2006",
   332		"2 January 2006 15:04:05 -0700",
   333		"2 January 2006 15:04:05 MST",
   334		"6-1-2 15:04",
   335		"6/1/2 15:04",
   336		"Jan 02, 2006",
   337		"Jan 02 2006 03:04:05PM",
   338		"Jan 2, 2006",
   339		"Jan 2, 2006 15:04:05 MST",
   340		"Jan 2, 2006 3:04:05 PM",
   341		"Jan 2, 2006 3:04:05 PM MST",
   342		"January 02, 2006",
   343		"January 02, 2006 03:04 PM",
   344		"January 02, 2006 15:04",
   345		"January 02, 2006 15:04:05 MST",
   346		"January 2, 2006",
   347		"January 2, 2006 03:04 PM",
   348		"January 2, 2006 15:04:05",
   349		"January 2, 2006 15:04:05 MST",
   350		"January 2, 2006, 3:04 p.m.",
   351		"January 2, 2006 3:04 PM",
   352		"Mon, 02 Jan 06 15:04:05 MST",
   353		"Mon, 02 Jan 2006",
   354		"Mon, 02 Jan 2006 15:04:05",
   355		"Mon, 02 Jan 2006 15:04:05 00",
   356		"Mon, 02 Jan 2006 15:04:05 -07",
   357		"Mon 02 Jan 2006 15:04:05 -0700",
   358		"Mon, 02 Jan 2006 15:04:05 --0700",
   359		"Mon, 02 Jan 2006 15:04:05 -07:00",
   360		"Mon, 02 Jan 2006 15:04:05 -0700",
   361		"Mon,02 Jan 2006 15:04:05 -0700",
   362		"Mon, 02 Jan 2006 15:04:05 GMT-0700",
   363		"Mon , 02 Jan 2006 15:04:05 MST",
   364		"Mon, 02 Jan 2006 15:04:05 MST",
   365		"Mon, 02 Jan 2006 15:04:05MST",
   366		"Mon, 02 Jan 2006, 15:04:05 MST",
   367		"Mon, 02 Jan 2006 15:04:05 MST -0700",
   368		"Mon, 02 Jan 2006 15:04:05 MST-07:00",
   369		"Mon, 02 Jan 2006 15:04:05 UT",
   370		"Mon, 02 Jan 2006 15:04:05 Z",
   371		"Mon, 02 Jan 2006 15:04 -0700",
   372		"Mon, 02 Jan 2006 15:04 MST",
   373		"Mon,02 Jan 2006 15:04 MST",
   374		"Mon, 02 Jan 2006 15 -0700",
   375		"Mon, 02 Jan 2006 3:04:05 PM MST",
   376		"Mon, 02 January 2006",
   377		"Mon,02 January 2006 14:04:05 MST",
   378		"Mon, 2006-01-02 15:04",
   379		"Mon, 2 Jan 06 15:04:05 -0700",
   380		"Mon, 2 Jan 06 15:04:05 MST",
   381		"Mon, 2 Jan 15:04:05 MST",
   382		"Mon, 2 Jan 2006",
   383		"Mon,2 Jan 2006",
   384		"Mon, 2 Jan 2006 15:04",
   385		"Mon, 2 Jan 2006 15:04:05",
   386		"Mon, 2 Jan 2006 15:04:05 -0700",
   387		"Mon, 2 Jan 2006 15:04:05-0700",
   388		"Mon, 2 Jan 2006 15:04:05 -0700 MST",
   389		"mon,2 Jan 2006 15:04:05 MST",
   390		"Mon 2 Jan 2006 15:04:05 MST",
   391		"Mon, 2 Jan 2006 15:04:05 MST",
   392		"Mon, 2 Jan 2006 15:04:05MST",
   393		"Mon, 2 Jan 2006 15:04:05 UT",
   394		"Mon, 2 Jan 2006 15:04 -0700",
   395		"Mon, 2 Jan 2006, 15:04 -0700",
   396		"Mon, 2 Jan 2006 15:04 MST",
   397		"Mon, 2, Jan 2006 15:4",
   398		"Mon, 2 Jan 2006 15:4:5 -0700 GMT",
   399		"Mon, 2 Jan 2006 15:4:5 MST",
   400		"Mon, 2 Jan 2006 3:04:05 PM -0700",
   401		"Mon, 2 January 2006",
   402		"Mon, 2 January 2006 15:04:05 -0700",
   403		"Mon, 2 January 2006 15:04:05 MST",
   404		"Mon, 2 January 2006, 15:04:05 MST",
   405		"Mon, 2 January 2006, 15:04 -0700",
   406		"Mon, 2 January 2006 15:04 MST",
   407		"Monday, 02 January 2006 15:04:05",
   408		"Monday, 02 January 2006 15:04:05 -0700",
   409		"Monday, 02 January 2006 15:04:05 MST",
   410		"Monday, 2 Jan 2006 15:04:05 -0700",
   411		"Monday, 2 Jan 2006 15:04:05 MST",
   412		"Monday, 2 January 2006 15:04:05 -0700",
   413		"Monday, 2 January 2006 15:04:05 MST",
   414		"Monday, January 02, 2006",
   415		"Monday, January 2, 2006",
   416		"Monday, January 2, 2006 03:04 PM",
   417		"Monday, January 2, 2006 15:04:05 MST",
   418		"Mon Jan 02 2006 15:04:05 -0700",
   419		"Mon, Jan 02,2006 15:04:05 MST",
   420		"Mon Jan 02, 2006 3:04 pm",
   421		"Mon Jan 2 15:04:05 2006 MST",
   422		"Mon Jan 2 15:04 2006",
   423		"Mon, Jan 2 2006 15:04:05 -0700",
   424		"Mon, Jan 2 2006 15:04:05 -700",
   425		"Mon, Jan 2, 2006 15:04:05 MST",
   426		"Mon, Jan 2 2006 15:04 MST",
   427		"Mon, Jan 2, 2006 15:04 MST",
   428		"Mon, January 02, 2006 15:04:05 MST",
   429		"Mon, January 02, 2006, 15:04:05 MST",
   430		"Mon, January 2 2006 15:04:05 -0700",
   431		"Updated January 2, 2006",
   432		time.ANSIC,
   433		time.RFC1123,
   434		time.RFC1123Z,
   435		time.RFC3339,
   436		time.RFC822,
   437		time.RFC822Z,
   438		time.RFC850,
   439		time.RubyDate,
   440		time.UnixDate,
   441	}
   442	
   443	func parseDate(ds ...string) (t time.Time, err error) {
   444		for _, d := range ds {
   445			d = strings.TrimSpace(d)
   446			if d == "" {
   447				continue
   448			}
   449			for _, f := range dateFormats {
   450				if t, err = time.Parse(f, d); err == nil {
   451					return
   452				}
   453			}
   454		}
   455		err = fmt.Errorf("could not parse dates: %v", strings.Join(ds, ", "))
   456		return
   457	}
Website layout inspired by memcached.
Content by the authors.