/* This is a lex(1) file, see http://dinosaur.compilertools.net/
 * or http://en.wikipedia.org/wiki/Lex_programming_tool .
 *
 * Compilation on UNIX systems is done by
 *   make wileyEndNoteFilt
 * On other systems one may need to call lex or flex and cc explicitly:
 *   lex -8 -o wileyEndNoteFilt.c wileyEndNoteFilt.l
 *   cc [-s] [-O] [-D MSDOS] [-D WILEYENDNOTEFILT_RIS] -o wileyEndNoteFilt wileyEndNoteFilt.c -ll
 *
 * Definition of the macro MSDOS means that the output is rewritten
 * with CR+LF line terminators (which alternatively can be achieved
 * by the standard unix2dos(1)).
 *
 * Definition of the macro WILEYENDNOTEFILT_RIS (the older spelling
 * WILEYENDNOTERIS is accepted as an alias) means that the RIS type tags
 * "A1 - " and "A2 - " are handled supposing that there is only
 * one name per line and swapping first and last names.
 *
 * The executable works as a filter and converts author names produced
 * by downloading EndNote files from Wiley's web pages such that they become
 * standardized. This means: the initials (first names) and last names of
 * authors are swapped and separated by a comma on output, and multiple
 * authors become separate lines.
 *
 * Use examples:
 *   cat *.enw | wileyEndNoteFilt
 *   cat *.enw | recode h1..h4 | wileyEndNoteFilt > tmp.end ; end2xml tmp.end > tmp.xml
 *
 * Notes:
 * The filter changes lines of the %A or %E tag containing at least one author.
 * This means
 * (i) the author list is only parsed until a line feed, so author
 *     lists continued on lines that do not start with another tag will remain
 *     incomplete;
 * (ii) a tag followed by no author is copied through unchanged.
 *
 * Richard J. Mathar, 2010-01-21
 */

%option noyywrap

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* The file header used to document the switch as WILEYENDNOTERIS while the
 * code tests WILEYENDNOTEFILT_RIS. Accept both so either spelling on the
 * compiler command line enables the RIS handling.
 */
#if defined(WILEYENDNOTERIS) && !defined(WILEYENDNOTEFILT_RIS)
#define WILEYENDNOTEFILT_RIS
#endif

/* Line terminator written after each author line. */
#ifdef MSDOS
#define WILEYENDNOTEFILT_EOL "\r\n"
#else
#define WILEYENDNOTEFILT_EOL "\n"
#endif

/* Allocate nbytes or terminate the filter with a diagnostic on stderr. */
static void *xmalloc(size_t nbytes)
{
    void *p = malloc(nbytes) ;
    if ( p == NULL )
    {
        fprintf(stderr,"wileyEndNoteFilt: out of memory\n") ;
        exit(EXIT_FAILURE) ;
    }
    return p ;
}

/* Remove trailing carriage returns and line feeds from inp
 * (this string may be changed on output).
 * Safe for the empty string and for strings that consist of CR/LF only.
 */
void trimlf(char *inp)
{
    size_t len = strlen(inp) ;
    while ( len > 0 && ( inp[len-1] == '\n' || inp[len-1] == '\r' ) )
        inp[--len] = '\0' ;
}

/* Remove blanks at the start and/or end of inp
 * (this string may be changed on output).
 * Safe for the empty string and for strings that consist of blanks only.
 */
void trim(char *inp)
{
    size_t len = strlen(inp) ;
    size_t lead = 0 ;

    /* chop trailing blanks; an all-blank string ends up empty here */
    while ( len > 0 && inp[len-1] == ' ' )
        inp[--len] = '\0' ;

    /* count the leading blanks and shift the remainder (including the
     * terminating '\0') to the front in one move
     */
    while ( inp[lead] == ' ' )
        lead++ ;
    if ( lead > 0 )
        memmove(inp,inp+lead,len-lead+1) ;
}

/* Return non-zero if word is one of the recognized name suffixes:
 * Jr., Sr. or a dotless Roman numeral from I to VII.
 */
static int isNameSuffix(const char *word)
{
    static const char *const suffixes[] = {
        "I", "II", "III", "Jr.", "IV", "V", "VI", "VII", "Sr."
    } ;
    size_t i ;
    for ( i = 0 ; i < sizeof suffixes / sizeof suffixes[0] ; i++ )
        if ( strcmp(word,suffixes[i]) == 0 )
            return 1 ;
    return 0 ;
}

/* Handle a single name without any separator and write it to yyout as
 *   tag lastname[, firstnames][, suffix]
 * followed by the line terminator.
 * @param inp A string of the form "firstname middle-name lastname",
 *   optionally surrounded by blanks. The string is not modified; all
 *   work is done on a private copy.
 * @param tag The tag that is written in front of the name, separated
 *   by one blank.
 *
 * Known limitations: titles in front of the first name ("Dr. Werner von
 * Braun", "Pope Paul Benedict") are not recognized, and Roman numerals
 * are assumed not to be followed by dots.
 */
void revStr1(char *inp, const char *tag)
{
    /* Modifiers that start a composite surname of Spanish, Dutch, German
     * etc. origin. Searched in this order, first hit wins, so the longest
     * matches come first (that is: catch the "Baron" in "Baron von
     * Munchhausen", not the "von"). Each entry is delimited by blanks on
     * both sides, so " de " never fires on " den ".
     */
    static const char *const particles[] = {
        " Baron ",      /* triggers also Baron von Munchhausen, for example */
        " Duke ",
        " Earl ",
        " Graf ",       /* triggers also "Graf von", "Graf zu" etc */
        " Gr\344fin ",  /* \344 is the single byte 0xE4, the ISO-8859-1 a-umlaut;
                         * UTF-8 encoded input does not match this entry */
        " Herzog ",
        " v. d. ",
        " von ",        /* triggers also on "von der" */
        " Von ",
        " van ",        /* triggers also "van den", "van der", "van de" */
        " Van ",        /* Van Morrisson is interpreted as a last name */
        " da ", " Da ",
        " de ", " De ",
        " du ", " Du ",
        " do ", " Do ",
        " della ", " Della ",
        " le ", " Le ",
        " dos ", " Dos ",
        " ter ", " Ter ",
        " ten ", " Ten ",
        /* the cases "e" and "i" are handled unsafely here:
         * probably another component of the family name precedes these */
        " e ", " E ", " i ",
        " den ", " Den "
    } ;
    const size_t inplen = strlen(inp) ;
    /* Private copy of the form " name": one guard blank followed by the
     * name stripped of surrounding blanks. The guard blank guarantees that
     * strrchr() below never returns NULL and that a particle at the very
     * start of the name is still preceded by a blank.
     */
    char *work = xmalloc(inplen+2) ;
    char *lastbl ;
    char *surn = NULL ;
    const char *surname ;
    const char *given = NULL ;
    const char *suffix = NULL ;
    size_t i ;

    work[0] = ' ' ;
    memcpy(work+1,inp,inplen+1) ;
    trim(work+1) ;

    /* Search for the last blank, usually right before the last name. */
    lastbl = strrchr(work,' ') ;

    /* Back up if the name proper is followed by Jr., Sr. or a Roman
     * numeral: remember the suffix and chop it (and the blanks in front
     * of it) off the working copy. A name that consists of such a word
     * only is left alone.
     */
    if ( lastbl != work && isNameSuffix(lastbl+1) )
    {
        suffix = lastbl+1 ;
        *lastbl = '\0' ;
        while ( lastbl > work+1 && lastbl[-1] == ' ' )
            *--lastbl = '\0' ;
        lastbl = strrchr(work,' ') ;
    }

    /* Start of the surname: the first particle found, or, if there is no
     * such modifier, the last blank.
     */
    for ( i = 0 ; i < sizeof particles / sizeof particles[0] && surn == NULL ; i++ )
        surn = strstr(work,particles[i]) ;
    if ( surn == NULL )
        surn = lastbl ;

    if ( surn == work )
        /* The surname starts right behind the guard blank: there are no
         * first names, so the name is written without comma. */
        surname = work+1 ;
    else
    {
        /* Split the copy at the blank in front of the surname and drop
         * surplus blanks at the end of the first names. */
        char *end = surn ;
        surname = surn+1 ;
        *end = '\0' ;
        while ( end > work+1 && end[-1] == ' ' )
            *--end = '\0' ;
        given = work+1 ;
    }

    fprintf(yyout,"%s %s",tag,surname) ;
    if ( given != NULL && *given != '\0' )
        fprintf(yyout,", %s",given) ;
    if ( suffix != NULL )
        fprintf(yyout,", %s",suffix) ;
    fputs(WILEYENDNOTEFILT_EOL,yyout) ;

    free(work) ;
}

/** Split the line (without the starting "%A " or "%E " EndNote tag) into
 * the individual authors (which are each followed by a comma) and hand
 * each of them to revStr1(). If WILEYENDNOTEFILT_RIS is defined the line
 * is not split but treated as one single name.
 * @param inp A string of the form "first-author, second-author, last-author,"
 *   or "first-author, second-author, last-author" or "first-author",
 *   optionally terminated by CR and/or LF.
 * @param tag The "%A" or "%E" (or the RIS equivalents).
 */
void revStr(const char *inp, const char *tag)
{
    /* strtok_r(3) modifies its argument, so we work on a temporary copy. */
    const size_t len = strlen(inp) ;
    char *inpsave = xmalloc(len+1) ;

    memcpy(inpsave,inp,len+1) ;
    trimlf(inpsave) ;

#ifdef WILEYENDNOTEFILT_RIS
    /* one name per line */
    revStr1(inpsave,tag) ;
#else
    {
        /* Loop over all terminating separators (=commas) and pass each
         * "firstinit secndinit lastname" to the subroutine without the
         * comma separator. Surrounding blanks are handled by revStr1().
         */
        char *tokreent = NULL ;
        char *tok ;
        for ( tok = strtok_r(inpsave,",",&tokreent) ; tok != NULL ;
              tok = strtok_r(NULL,",",&tokreent) )
            revStr1(tok,tag) ;
    }
#endif

    free(inpsave) ;
}
%}

/* NOTE(review): the RIS tag patterns below contain single blanks; RIS files
 * commonly use two blanks between the tag and the hyphen ("A1  - ").
 * Confirm against real input before changing the patterns and the matching
 * yytext offsets in the rules.
 */
TAGA	"%A "
TAGARIS	"A1 - "
TAGE	"%E "
TAGERIS	"A2 - "

%%

{TAGA}.+\n	{
		/* If this is a tag followed by one or more authors, push the
		 * list of authors, including the LF, the initial blank after
		 * the tag and all commas, to revStr().
		 */
		revStr(yytext+2,"%A") ;
	}

{TAGARIS}.+\n	{
#ifdef WILEYENDNOTEFILT_RIS
		/* Same as above for a RIS author line; the complete tag is
		 * skipped and re-created on output.
		 */
		revStr(yytext+5,"A1 -") ;
#else
		/* RIS handling disabled: copy the line through unchanged. */
		fprintf(yyout,"%s",yytext) ;
#endif
	}

{TAGE}.+\n	{
		/* Same as above for the list of editors. */
		revStr(yytext+2,"%E") ;
	}

{TAGERIS}.+\n	{
#ifdef WILEYENDNOTEFILT_RIS
		/* Same as above for a RIS editor line. */
		revStr(yytext+5,"A2 -") ;
#else
		/* RIS handling disabled: copy the line through unchanged. */
		fprintf(yyout,"%s",yytext) ;
#endif
	}

%%

/* Run the filter on the file named by the first command line argument,
 * or on standard input if there is none. The result goes to yyout.
 * Returns 0 on success and EXIT_FAILURE if the input file cannot be
 * opened or the output cannot be written.
 */
int main(int argc, char *argv[])
{
    int status = 0 ;

    if ( argc > 1 )
    {
        yyin = fopen(argv[1],"r") ;
        if ( yyin == NULL )
        {
            /* do not fall through to the scanner with a NULL stream */
            perror(argv[1]) ;
            return EXIT_FAILURE ;
        }
    }
    else
        yyin = stdin ;

    yylex() ;

    if ( fflush(yyout) != 0 || ferror(yyout) )
    {
        fprintf(stderr,"wileyEndNoteFilt: error writing output\n") ;
        status = EXIT_FAILURE ;
    }
    if ( yyin != stdin )
        fclose(yyin) ;
    return status ;
}