# Defines two classes: # # BibTexEntry, subclass of BibEntry, and provides all BibTeX specific methods such as # writing an entry to file # # BibTex, a subclass of Bibliography, and provides all BibTeX specific methods, in # particular a parser. # Copyright (c) 2007, Peter Corke # # All rights reserved. # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * The name of the copyright holder may not be used to endorse or # promote products derived from this software without specific prior # written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
import Bibliography
import BibEntry
import string
import re
import sys
import urllib


class BibTeXEntry(BibEntry.BibEntry):
    """A bibliography entry that can be written out in BibTeX syntax."""

    # write a BibTex format entry
    def write(self, file=sys.stdout, stringdict=None):
        """Write this entry to *file* as a BibTeX record.

        Fields whose name starts with '_' and the 'Type' field are used
        internally and are not written.  Author/Editor lists are joined
        with " and "; values that are keys of the owning bibliography's
        abbrevDict are written bare, everything else inside braces.

        *stringdict* is accepted for interface compatibility; it is not
        consulted here.
        """
        file.write("@%s{%s,\n" % (self.getRefType(), self.getKey()))

        # Work out first which fields are really written.  Counting the
        # skipped internal fields as well (as was done previously) left a
        # trailing comma whenever a skipped field happened to come last.
        written = [rk for rk in self.fieldDict
                   if not rk.startswith('_') and rk != 'Type']
        last = len(written) - 1

        for n, rk in enumerate(written):
            value = self.fieldDict[rk]
            file.write(" %s = " % rk)

            if rk in ('Author', 'Editor'):
                file.write("{%s}" % " and ".join(value))
            elif rk == 'Month':
                if value:
                    file.write("{%s}" % value)
                else:
                    # no stored value: fall back to the three letter,
                    # lower case form of the month name
                    value = self.getMonthName()
                    file.write("%s" % value[0:3].lower())
            elif value in self.bibliography.abbrevDict:
                # it is an abbrev, write it undelimited
                file.write("%s" % value)
            else:
                file.write("{%s}" % value)

            # add comma to all but the last field
            if n < last:
                file.write(",\n")
            else:
                file.write("\n")
        file.write("}\n\n")

    def setField(self, field, value):
        """Store *value* under *field*, converting BibTeX author syntax.

        For the author and editor fields (any letter case) a string of the
        form "X and Y" is split into a list of names, each with surrounding
        blanks and one level of "..." or {...} delimiters removed.  A value
        without a split() method (for example an already split list) is
        passed through unchanged.  An AttributeError raised by the
        superclass setter is reported on stderr instead of propagating.
        """
        def strStrip(s):
            s = s.strip(' ')
            # slices rather than indices so that an empty name is harmless
            if s[:1] == '"' and s[-1:] == '"':
                return s[1:-1]
            if s[:1] == '{' and s[-1:] == '}':
                return s[1:-1]
            return s

        # deal specially with author list, convert from bibtex X and Y to
        # a list for bibentry class
        if field.lower() in ("author", "editor") and hasattr(value, 'split'):
            value = [strStrip(name) for name in value.split(" and ")]

        try:
            # invoke the superclass
            BibEntry.BibEntry.setField(self, field, value)
        except AttributeError as err:
            sys.stderr.write("%15s: bad value <%s=%s>"
                             % (self.getKey(), field, value))


class BibTeX(Bibliography.Bibliography):
    """A bibliography that can be read from and written to BibTeX files."""

    # NOTE: class level attribute, hence shared by all instances
    stringDict = {}

    def parseFile(self, fileName=None, verbose=0, ignore=False):
        """Parse a BibTeX file and return the count from parseString().

        Reads from stdin when *fileName* is None, otherwise from the object
        returned by self.open(fileName).  An AttributeError raised while
        parsing is reported on stderr and 0 is returned.  The file object
        is always handed to self.close(), even if parsing fails.
        """
        if fileName is None:
            fp = sys.stdin
        else:
            fp = self.open(fileName)

        nbib = 0
        try:
            # get the file into one huge string
            s = fp.read()
            try:
                nbib = self.parseString(s, ignore=ignore, verbose=verbose)
            except AttributeError as err:
                sys.stderr.write("Error %s\n" % err)
        finally:
            self.close(fp)
        return nbib

    def display(self):
        """Call display() on every entry of the bibliography."""
        for be in self:
            be.display()

    def write(self, file=sys.stdout, resolve=0):
        """Write every entry to *file* in BibTeX format.

        When *resolve* is true the string table (stringDict) is passed on
        to each entry's write(), otherwise None is passed.
        """
        if resolve:
            strings = self.stringDict
        else:
            strings = None

        for be in self:
            be.write(file, strings)

    def writeStrings(self, file=sys.stdout):
        """Write one @string record per item of abbrevDict to *file*."""
        for abbrev, value in self.abbrevDict.items():
            file.write("@string{ %s = {%s} }\n" % (abbrev, value))

    # resolve BibTeX's cross reference capability
    def resolveCrossRef(self):
        """Copy missing fields into each entry from the entry it crossrefs.

        For every entry with a 'crossref' field the entry whose key equals
        that field is looked up, and each of its fields that the referring
        entry lacks is copied across.  Entries without a crossref, or whose
        crossref matches no key, are left alone.

        NOTE(review): relies on entries supporting getField(), iteration
        over field names and the `in` test, and on getField() either
        raising or returning a false value for a missing field -- confirm
        against BibEntry.
        """
        entries = list(self)
        byKey = {}
        for be in entries:
            byKey[be.getKey()] = be

        for be in entries:
            try:
                key = be.getField('crossref')
            except Exception:
                # no cross reference in this entry, try the next one
                continue
            if not key:
                continue
            xref = byKey.get(key)
            if xref is None or xref is be:
                continue

            for f in xref:
                if not (f in be):
                    be.setField(f, xref.getField(f))

    def parseString(self, s, verbose=0, ignore=False):
        """Parse BibTeX source text *s* into this bibliography.

        @string records are stored through insertAbbrev(), @comment
        records are skipped, and every other record becomes a BibTeXEntry
        handed to insertEntry(entry, ignore).  Parsing stops at the end of
        the input or at the first syntax error, whose line number is
        printed.  *verbose* is accepted but not used.

        Returns the number of top level records processed; this includes
        @string and @comment records.
        """

        # lexical analyzer for bibtex format files
        class BibLexer(object):
            """Character level reader with one character pushback."""

            inString = ""       # the string to parse
            lineNum = 1         # current line, counted from newlines read
            pos = 0             # index of the next character to return

            def __init__(self, s):
                self.inString = s

            # an iterator for the class, return next character
            def next(self):
                if self.pos >= len(self.inString):
                    raise StopIteration
                c = self.inString[self.pos]
                if c == '\n':
                    self.lineNum += 1
                self.pos += 1
                return c

            __next__ = next     # iterator protocol name used by Python 3

            def __iter__(self):
                return self

            # peek at the next character, '' when the input is exhausted
            def peek(self):
                return self.inString[self.pos:self.pos + 1]

            # push a character back onto the input
            def pushback(self, c):
                self.pos -= 1
                if c == '\n':
                    self.lineNum -= 1

            # eat whitespace characters and comments (% to end of line)
            def skipwhite(self):
                for c in self:
                    if c == '%':
                        for c in self:
                            if c == '\n':
                                break
                    elif not c.isspace():
                        self.pushback(c)
                        break

            # skip the body of a record whose opening delimiter (opener)
            # has already been read, up to and including the matching
            # close; nested {...} groups are stepped over
            def skipbody(self, opener):
                nested = 0
                for c in self:
                    if c == '{':
                        nested += 1
                    elif c == '}':
                        if nested > 0:
                            nested -= 1
                        elif opener != '(':
                            return
                    elif c == ')' and opener == '(' and nested == 0:
                        return

            # debugging aid: show the next few characters of unread input
            def show(self):
                rest = self.inString[self.pos:self.pos + 10]
                sys.stderr.write("[%s]%s\n" % (rest[:1], rest[1:]))

            # get the next word from the input stream, this can be
            #   [alpha][alnum$_-]
            #   "...."
            #   {....}
            # delimited words are returned with their delimiters; the
            # result is empty if the next character cannot start a word
            def nextword(self):
                word = ""
                c = self.peek()
                if c == '"':
                    # quote delimited string, ended by an unescaped quote
                    word = self.next()
                    cp = None   # prev char
                    for c in self:
                        word += c
                        if (c == '"') and (cp != '\\'):
                            break
                        cp = c
                elif c == '{':
                    # brace delimited string, ended by the matching brace
                    count = 0
                    for c in self:
                        if c == '{':
                            count += 1
                        if c == '}':
                            count -= 1
                        word += c
                        if count == 0:
                            break
                else:
                    # undelimited string
                    for c in self:
                        if c.isalnum() or c in ".+-_$:'":
                            word += c
                        else:
                            self.pushback(c)
                            break
                return word

        class Token(object):
            """One lexical token: a type code and, for some types, text."""

            t_ENTRY = 1
            t_DELIM_L = 2
            t_DELIM_R = 3
            t_STRING = 5
            t_EQUAL = 6
            t_COMMA = 7

            val = None
            type = None
            opener = None       # for t_ENTRY: the character after the type

            def __repr__(self):
                if self.type == self.t_ENTRY:
                    return "@ %s" % self.val
                if self.type == self.t_DELIM_L:
                    return " {"
                if self.type == self.t_DELIM_R:
                    return " }"
                if self.type == self.t_STRING:
                    return "<%s>" % self.val
                if self.type == self.t_EQUAL:
                    return " EQUAL"
                if self.type == self.t_COMMA:
                    return " COMMA"
                return "BAD TOKEN (%s) <%s>" % (self.type, self.val)

            def isstring(self):
                return self.type == self.t_STRING

            def isabbrev(self):
                # an undelimited alphanumeric word, but not a bare number:
                # registering "1999" as an abbreviation would make
                # writeStrings() emit a bogus @string record for it
                return ((self.type == self.t_STRING)
                        and self.val.isalnum()
                        and not self.val[:1].isdigit())

            def iscomma(self):
                return self.type == self.t_COMMA

            def isequal(self):
                return self.type == self.t_EQUAL

            def isentry(self):
                return self.type == self.t_ENTRY

            def isdelimR(self):
                return self.type == self.t_DELIM_R

            def isdelimL(self):
                return self.type == self.t_DELIM_L

        #
        # tokenizer for bibtex format files
        #
        class BibTokenizer(object):
            """Iterator that turns lexer characters into Token objects."""

            lex = None

            def __init__(self, s):
                self.lex = BibLexer(s)

            # setup an iterator for the next token
            def __iter__(self):
                return self

            # return next token; StopIteration from the lexer marks the
            # end of the input
            def next(self):
                self.lex.skipwhite()
                c = self.lex.next()
                t = Token()
                if c == '@':
                    # "@type{" or "@type(": the opening delimiter is
                    # consumed as part of the entry token
                    t.type = t.t_ENTRY
                    self.lex.skipwhite()
                    t.val = self.lex.nextword()
                    self.lex.skipwhite()
                    c = self.lex.next()
                    t.opener = c
                    if c not in ('{', '('):
                        sys.stderr.write("BAD START OF ENTRY\n")
                elif c == ',':
                    t.type = t.t_COMMA
                elif c == '=':
                    t.type = t.t_EQUAL
                elif c in ('}', ')'):
                    t.type = t.t_DELIM_R
                else:
                    self.lex.pushback(c)
                    t.type = t.t_STRING
                    t.val = self.lex.nextword()
                return t

            __next__ = next     # iterator protocol name used by Python 3

        class BibParser(object):
            """Iterator that consumes one top level record per step."""

            tok = None
            bibtex = None

            def __init__(self, s, bt):
                self.tok = BibTokenizer(s)
                self.bibtex = bt

            # setup an iterator for the next entry
            def __iter__(self):
                return self

            # read a token and insist that it passes the given Token test
            def expect(self, test):
                t = self.tok.next()
                if not test(t):
                    raise SyntaxError(self.tok.lex.lineNum)
                return t

            # process the next record; always returns None
            def next(self):
                def strstrip(s):
                    # drop the delimiters of a "..." or {...} word
                    if s[:1] in ('"', '{'):
                        return s[1:-1]
                    return s

                t = self.expect(Token.isentry)
                kind = t.val.lower()

                if kind == 'string':
                    tn = self.expect(Token.isstring)
                    self.expect(Token.isequal)
                    tv = self.expect(Token.isstring)
                    # insert string into the string table
                    self.bibtex.insertAbbrev(tn.val, strstrip(tv.val))
                    self.expect(Token.isdelimR)

                elif kind == 'comment':
                    # The body is free text, so it is skipped character by
                    # character rather than tokenized; the opening
                    # delimiter went with the entry token.
                    self.tok.lex.skipbody(t.opener)

                else:
                    # NOT A STRING or COMMENT ENTRY
                    # assume a normal reference type

                    # get the cite key
                    ck = self.expect(Token.isstring)
                    be = BibTeXEntry(ck.val, self.bibtex)
                    be.setType(t.val)

                    # get the comma
                    self.expect(Token.iscomma)

                    # get the field value pairs
                    for tf in self.tok:
                        # allow for poor syntax with comma before
                        # end brace
                        if tf.isdelimR():
                            break
                        if not tf.isstring():
                            raise SyntaxError(self.tok.lex.lineNum)
                        self.expect(Token.isequal)
                        ts = self.expect(Token.isstring)
                        be.setField(tf.val, strstrip(ts.val))

                        # if it was an abbrev in the file, put it in the
                        # abbrevDict so it gets written as an abbrev
                        if ts.isabbrev():
                            self.bibtex.insertAbbrev(ts.val, None)

                        t = self.tok.next()
                        if t.iscomma():
                            continue
                        if t.isdelimR():
                            break
                        raise SyntaxError(self.tok.lex.lineNum)

                    self.bibtex.insertEntry(be, ignore)
                return

            __next__ = next     # iterator protocol name used by Python 3

        bibparser = BibParser(s, self)
        bibcount = 0
        try:
            for be in bibparser:
                bibcount += 1
        except SyntaxError as err:
            sys.stdout.write("Syntax error at line " + str(err) + "\n")
        return bibcount