Downloading your Blogger archives
Tags [ archives, Atom, backup, Blogger, CSV, download, Python ]
A friend was looking for a way to grab an archive of his Blogger posts into a CSV file he could do text mining on (and, presumably, use as a lo-fi backup mechanism). I wrote this Python script for him; it walks the blog's public Atom feed, following the rel="next" links until it has paged through every post. Enjoy.
#!/usr/bin/env python
#
# Copyright (C) 2009 by Jon Moore
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import urllib2
import xml.etree.ElementTree as etree
# The feed to archive and the CSV file to write it to; point blog_feed
# at your own blog's Atom feed to archive that instead.
blog_feed = 'http://codeartisan.blogspot.com/feeds/posts/default'
output = 'posts.csv'

# Atom namespace, needed to qualify tag names in ElementTree lookups.
ATOM_NS = 'http://www.w3.org/2005/Atom'
def norm(s):
    # Strip non-ASCII characters so the value is safe to write to the CSV.
    if not s:
        return None
    return s.encode('ascii', 'ignore')
def main():
    f = open(output, 'wb')
    csv_wr = csv.writer(f)
    csv_wr.writerow(['id', 'published', 'updated', 'permalink',
                     'title', 'content'])
    # Ask Blogger for up to 100 posts per page of the feed.
    url = blog_feed + '?max-results=100'
    while url:
        print "fetching", url
        feed = etree.fromstring(urllib2.urlopen(url).read())
        for entry in feed.findall("{%s}entry" % ATOM_NS):
            post_id = entry.find("{%s}id" % ATOM_NS).text
            published = entry.find("{%s}published" % ATOM_NS).text
            updated = entry.find("{%s}updated" % ATOM_NS).text
            title = norm(entry.find("{%s}title" % ATOM_NS).text)
            content = norm(entry.find("{%s}content" % ATOM_NS).text)
            # The post's permalink is its rel="alternate" HTML link.
            perm_url = ''
            for link in entry.findall("{%s}link" % ATOM_NS):
                if (link.get('rel') == 'alternate'
                        and link.get('type') == 'text/html'):
                    perm_url = link.get('href')
                    break
            csv_wr.writerow([post_id, published, updated, perm_url,
                             title, content])
            print "wrote", post_id
        # Follow the feed-level rel="next" link to the next page, if any;
        # when there isn't one, we've seen every post and the loop ends.
        url = None
        for link in feed.findall("{%s}link" % ATOM_NS):
            if link.get('rel') == 'next':
                url = link.get('href')
                break
    f.close()
if __name__ == "__main__":
    main()
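
Run it with a stock Python 2 interpreter (it uses urllib2 and print statements, so it's 2.x only). Saved as, say, blogger_csv.py (the filename is arbitrary), a session looks roughly like this; the post IDs below are made up, but the tag: URI shape is what Blogger actually puts in its Atom entries:

$ python blogger_csv.py
fetching http://codeartisan.blogspot.com/feeds/posts/default?max-results=100
wrote tag:blogger.com,1999:blog-12345.post-67890
wrote tag:blogger.com,1999:blog-12345.post-67891
...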
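
And since the point of the exercise was text mining, here's a minimal sketch of reading the archive back in. The only thing it assumes is the column order the script itself writes in its header row; the word count is just a placeholder for whatever analysis you actually want to do:

import csv

with open('posts.csv', 'rb') as f:
    reader = csv.reader(f)
    header = reader.next()  # ['id', 'published', 'updated', ...]
    for row in reader:
        post = dict(zip(header, row))
        # e.g., a crude words-per-post count
        print post['title'], len(post['content'].split())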