Last update: 2011-07-13 12:21:19
This builds an index of MP3 files (using the Mutagen library) and allows keyword searches in it. It requires Mutagen (install with "easy_install mutagen") and SQLite support in Python (which should be there in the latest versions).
At first I considered using the Whoosh search engine (very similar to Lucene), but it does more than is necessary here. I am still looking for a good Soundex implementation for Python, though.
#!/usr/bin/env python
import os
import sys
import mutagen
import sqlite3
import unicodedata
import re
import time
# Path of the SQLite database file holding the index; Index.db() passes it to
# sqlite3.connect(). Change this path to point at your own database.
dsn = '/Users/mickael/python_sandbox/tags/id3.sqlite'
class Analyzer:
    """Turn free text into a list of normalized, searchable keywords.

    Normalization strips accents, treats every character other than an
    ASCII letter or digit as a word separator, lowercases each word and
    drops the words listed in ``stop_words``.
    """

    # Any run of characters outside [A-Za-z0-9] separates two words.
    # Compiled once here instead of on every analyze() call.
    _separators = re.compile(r'[^A-Za-z0-9]+')

    def __init__(self):
        # Kept as a plain list attribute so existing callers can still
        # inspect or extend it.
        self.stop_words = ['los','las','el','the','of','and','le','de','a','des','une','un','s','is','www','http','com','org']

    def analyze(self, text):
        """Return the keywords found in *text*.

        Args:
            text: the text to analyze (text string, byte string, or any
                object convertible to text).

        Returns:
            A list of lowercase keywords in order of appearance, without
            stop words. Empty when *text* holds no keyword.
        """
        plain = self.strip_accents(text)
        words = []
        for word in self._separators.sub(" ", plain).split():
            word = word.lower()
            # Compare AFTER lowercasing so that "The" or "WWW" are filtered
            # exactly like "the" and "www".
            if word not in self.stop_words:
                words.append(word)
        return words

    def strip_accents(self, s):
        """Return *s* as text with its combining accent marks removed.

        Byte strings are decoded as UTF-8 (undecodable bytes are replaced)
        rather than with the default codec, so non-ASCII input does not
        raise. Other non-text objects are converted to text first.
        """
        text_type = type(u'')
        if isinstance(s, bytes):
            s = s.decode('utf-8', 'replace')
        elif not isinstance(s, text_type):
            s = text_type(s)
        # NFD splits an accented letter into base letter + combining mark
        # (category 'Mn'); dropping the marks leaves the bare letter.
        decomposed = unicodedata.normalize('NFD', s)
        return u''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
class ID3:
    """Tag and file metadata of one audio file, read through Mutagen.

    Attributes set by the constructor:
        album, artist, title, genre: first value of the matching tag,
            or '' when the tag is missing or empty.
        comment: text of the first COMM frame whose description is empty
            (MP3 files only), or '' when there is none.
        length: ``info.length`` as reported by Mutagen.
        duration: ``length`` formatted as "M:SS".
        size: size of the file on disk, from ``os.stat``.
    """

    def __init__(self,path):
        """Load the metadata of the file at *path*.

        Raises:
            ValueError: if Mutagen does not recognize the file type.
            Any error raised by Mutagen or ``os.stat`` while reading it.
        """
        self._load(path)

    @staticmethod
    def _first(tags, key):
        """Return the first value stored under *key* in *tags*, or ''."""
        values = tags.get(key) or ['']
        return values[0]

    def _load(self, filename):
        """Read tags, stream information and size of *filename* into self."""
        # Import the submodules explicitly: "import mutagen" alone does not
        # guarantee that mutagen.mp3 / mutagen.easyid3 are loaded.
        import mutagen.mp3
        import mutagen.easyid3

        full_tags = mutagen.File(filename)
        if full_tags is None:
            # mutagen.File() returns None when it cannot identify the file;
            # fail with a clear message instead of an AttributeError later.
            raise ValueError("unsupported or unreadable audio file: %r" % (filename,))

        short_tags = full_tags
        comments = []
        if isinstance(full_tags, mutagen.mp3.MP3):
            # Comments are read from the raw COMM frames; only frames with
            # an empty description are kept.
            for key in full_tags:
                if key.startswith('COMM'):
                    frame = full_tags[key]
                    if frame.desc == '' and frame.text:
                        comments.append(frame.text[0])
            # Re-open with the EasyID3 interface so the tags can be read
            # by name ('album', 'artist', ...) below.
            short_tags = mutagen.mp3.MP3(filename, ID3 = mutagen.easyid3.EasyID3)
        # Sentinel so comments[0] always exists.
        comments.append('')

        self.album = self._first(short_tags, 'album')
        self.artist = self._first(short_tags, 'artist')
        self.title = self._first(short_tags, 'title')
        self.genre = self._first(short_tags, 'genre')
        self.comment = comments[0]
        self.length = full_tags.info.length
        # Whole minutes and remaining whole seconds, e.g. 185.3 -> "3:05".
        self.duration = "%u:%.2d" % divmod(int(self.length), 60)
        self.size = os.stat(filename).st_size
class Index:
    """SQLite-backed keyword index of MP3 tags.

    ``build`` walks a directory tree and recreates the index from scratch;
    ``search`` prints the location of every indexed file that matches all
    keywords of a query. The database file is the module-level ``dsn``.

    Output uses single-argument ``print(...)`` calls, which behave the same
    as a print statement and as a print function.
    """

    def build(self,start):
        """Rebuild the whole index from the ``.mp3`` files found under *start*.

        All existing rows are deleted first. The name of each file is
        printed as it is processed; files whose tags cannot be read are
        skipped and listed under an "Errors" heading at the end.

        Args:
            start: root directory to walk recursively.
        """
        errors = []
        analyzer = Analyzer()
        cnx = self.db()
        # The connection as context manager commits when the block succeeds
        # and rolls back if it raises, so a failed rebuild leaves the
        # previous index untouched.
        with cnx:
            cursor = cnx.cursor()
            cursor.execute("DELETE FROM id3index;")
            cursor.execute("DELETE FROM id3;")
            for root, _dirs, files in os.walk(start):
                for name in files:
                    if not name.lower().endswith('.mp3'):
                        continue
                    path = os.path.join(root,name)
                    print(name)
                    try:
                        id3 = ID3(path)
                    except Exception:
                        # Best effort: remember the file and carry on.
                        # KeyboardInterrupt is deliberately NOT swallowed.
                        errors.append(path)
                        continue
                    cursor.execute("INSERT INTO id3(location, artist, title, album, genre, comment, duration, length, size) VALUES(?,?,?,?,?,?,?,?,?)",
                        (path,id3.artist,id3.title,id3.album,id3.genre,id3.comment,id3.duration,id3.length,id3.size))
                    last_id3_id = cursor.lastrowid
                    # One index row per keyword and per field it came from.
                    for field in ['artist', 'title', 'album', 'comment', 'genre']:
                        for word in analyzer.analyze(getattr(id3, field)):
                            cursor.execute("INSERT INTO id3index(id3_id,keyword,field) VALUES (?,?,?);", (str(last_id3_id), word, field))
            cursor.execute('SELECT COUNT(*) AS nbrows FROM id3index LIMIT 1;')
            print('index size: ' + str(cursor.fetchone()["nbrows"]))
        if errors:
            print("")
            print("---- Errors ----")
            print("")
            for error in errors:
                print(error)

    def search(self,query):
        """Print the location of every indexed file matching *query*.

        The query is split into keywords by ``Analyzer``; a file matches
        when every keyword appears in its index entries. Nothing is printed
        when the query holds no searchable keyword.

        Args:
            query: free-text search string.
        """
        cnx = self.db()
        words = Analyzer().analyze(query)
        if not words:
            # Only stop words / punctuation: an empty WHERE clause would be
            # a SQL syntax error, and there is nothing to look up anyway.
            return
        # Keywords are stored lowercased and the analyzer lowercases the
        # query, so plain equality matches and can use keyword_idx. Values
        # are bound as parameters instead of being pasted into the SQL.
        clause = "id3_id IN(SELECT id3_id FROM id3index WHERE keyword = ?)"
        # NOTE(review): score counts every index row of a matching file,
        # not only the rows that matched the query - confirm this is the
        # intended ranking.
        q = ('SELECT COUNT(id3index.id) AS score, id3_id, id3.* from id3index join id3 on id3.id = id3index.id3_id where '
             + ' AND '.join([clause] * len(words))
             + ' GROUP BY id3_id ORDER BY score DESC')
        cursor = cnx.cursor()
        cursor.execute(q, words)
        for line in cursor:
            print(line["location"])

    def db(self):
        """Return the shared SQLite connection, opening it on first use.

        The first call connects to ``dsn`` and creates the ``id3`` and
        ``id3index`` tables and the keyword index if they do not exist.
        Rows are returned as ``sqlite3.Row`` objects (access by column name).
        """
        if getattr(self,"database", None) is None:
            database = sqlite3.connect(dsn)
            database.row_factory = sqlite3.Row
            database.text_factory = str
            cursor = database.cursor()
            cursor.execute("CREATE TABLE IF NOT EXISTS id3index(id INTEGER PRIMARY KEY AUTOINCREMENT,id3_id, keyword, field)")
            cursor.execute("CREATE TABLE IF NOT EXISTS id3(id INTEGER PRIMARY KEY AUTOINCREMENT,location UNIQUE, artist, title, album, genre, comment, duration, length, size)")
            cursor.execute("CREATE INDEX IF NOT EXISTS keyword_idx ON id3index(keyword)")
            # Cache the connection only once the schema is in place.
            self.database = database
        return self.database
# Command-line entry point: "index-build <dir>" rebuilds the index from a
# music directory, "search <query>" prints the matching file locations.
if __name__ == '__main__':
    # Show the usage text for a missing argument AND for an unknown
    # command, instead of silently doing nothing in the latter case.
    if len(sys.argv) < 3 or sys.argv[1] not in ('index-build', 'search'):
        print('Usage: tags.py index-build [your music dir]')
        print('       tags.py search [your keywords]')
    else:
        index = Index()
        if sys.argv[1] == 'index-build':
            index.build(sys.argv[2])
        else:
            index.search(sys.argv[2])
$ ./tags.py index-build /your/music/dir
$ ./tags.py search "pendulum colour"
This should print a list like this:
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/girl_in_the_fire_-_original_mix.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/hold_your_colour_-_original_mix.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/fasten_your_seatbelt_-_original_mix.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/tarantula_-_original_mix.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/hold_you_color.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/the_stream_one.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/sound_of_life.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/out_there.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/through_the_loop.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/pendulum_intro.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/pendulum_intro2.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/the_terminal.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/plasticworld.mp3
/Volumes/MYBOOK/music/p/pendulum/hold_your_colour/slam.mp3
Then you can turn it into a playlist:
$ ./tags.py search "pendulum colour" > pendulum_-_hold_your_colour.m3u