Several standards are available, of which four are implemented: 1.0, 1.1, CIF2 and STAR2. CIF2 differs from STAR2 in that STAR2 separates list and table items with commas and permits nested save frames, neither of which is allowed in CIF2. Note that 1.0, 1.1 and CIF2/STAR2 also differ in their treatment of unquoted data values beginning with brackets. Because of the large commonality, we express each of the standards as a slight deviation from a general standard using Noweb chunks.
Old CIF 1.0 standard. This differs from 1.1 in allowing square brackets to begin an undelimited text string.
<Lexer 1.0>=
<Common v1 lexer code>
<CIF1.0 data value>
<Common postamble>
<Common postamble>= (<-U U-> U-> U->)
lexer = lex.lex(debug=1)
if __name__ == "__main__":
    lex.runmain(lexer)
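A minimal sketch of driving one of these lexers, assuming <Lexer 1.0> has been tangled into a module named cif10_lexer (the module name is purely illustrative). The postamble builds the lexer at import time; note that debug=1 makes that build verbose.

    # Hypothetical module name: tangle <Lexer 1.0> to cif10_lexer.py first
    import cif10_lexer

    lexer = cif10_lexer.lexer              # built by the postamble via lex.lex(debug=1)
    lexer.input("data_demo _cell.length_a 5.959\n")
    for tok in lexer:                      # PLY lexers are iterable; iteration stops at end of input
        print((tok.type, tok.value))
    # expected token types: DATA_HEADING, DATA_NAME, DATA_VALUE_1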
An undelimited CIF 1.0 data value may begin with a naked square bracket.
<CIF1.0 data value>= (<-U)
def t_DATA_VALUE_1(t):
    r"((?!(((S|s)(A|a)(V|v)(E|e)_[^\s]*)|((G|g)(L|l)(O|o)(B|b)(A|a)(L|l)_[^\s]*)|((S|s)(T|t)(O|o)(P|p)_[^\s]*)|((D|d)(A|a)(T|t)(A|a)_[^\s]*)))[^\s\"#$'_][^\s]*)|'(('(?=\S))|([^\n\r\f']))*'+|\"((\"(?=\S))|([^\n\r\"]))*\"+"
    if len(t.value)>1:
        if t.value[0]=='\'' and t.value[-1]=='\'':
            t.value = t.value[1:-1]
        elif t.value[0]=='"' and t.value[-1]=='"':
            t.value = t.value[1:-1]
    return t
<Lexer 1.1>=
<Common v1 lexer code>
<CIF1.1 data value>
<Common postamble>
<CIF1.1 data value>= (<-U)
def t_DATA_VALUE_1(t):
    r"((?!(((S|s)(A|a)(V|v)(E|e)_[^\s]*)|((G|g)(L|l)(O|o)(B|b)(A|a)(L|l)_[^\s]*)|((S|s)(T|t)(O|o)(P|p)_[^\s]*)|((D|d)(A|a)(T|t)(A|a)_[^\s]*)))[^\s\"#$'_\[\]][^\s]*)|'(('(?=\S))|([^\n\r\f']))*'+|\"((\"(?=\S))|([^\n\r\"]))*\"+"
    # unlike CIF 1.0, '[' and ']' may not begin an undelimited value
    if len(t.value)>1:
        if t.value[0]=='\'' and t.value[-1]=='\'':
            t.value = t.value[1:-1]
        elif t.value[0]=='"' and t.value[-1]=='"':
            t.value = t.value[1:-1]
    return t
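The practical effect of the two first-character classes can be checked in isolation with the re module. This is only a sketch: the patterns below are the leading character classes taken from the chunks above (with the 1.1 class excluding brackets, as described), not the full token expressions.

    import re
    first_char_10 = re.compile(r"[^\s\"#$'_]")        # CIF 1.0: '[' may start an undelimited value
    first_char_11 = re.compile(r"[^\s\"#$'_\[\]]")    # CIF 1.1: '[' and ']' may not
    print(bool(first_char_10.match("[quasi-list]")))  # True
    print(bool(first_char_11.match("[quasi-list]")))  # False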
<Common v1 lexer code>= (<-U <-U)
# A new lexer for CIF using PLY
#
import ply.lex as lex
import re
from StarFile import remove_line_folding,remove_line_prefix

states = (
    ('semicolon','exclusive'),
    )

tokens = (
    'COMMENT',
    'WHITESPACE',
    'LBLOCK',
    'GLOBAL',
    'STOP',
    'SAVE_HEADING',
    'SAVE_END',
    'DATA_NAME',
    'DATA_HEADING',
    'START_SC_LINE',
    'SC_LINE_OF_TEXT',
    'END_SC_LINE',
    'DATA_VALUE_1'
)

t_ignore_WHITESPACE = r"([ \t\n\r](?!;))|[ \t]"
t_ignore_COMMENT = r"(\#.*[\n\r](?!;))|(\#.*)"

def t_error(t):
    print 'Illegal character %s' % repr(t.value[0])

def t_LBLOCK(t):
    r"(L|l)(O|o)(O|o)(P|p)_"
    return t

def t_GLOBAL(t):
    r"(G|g)(L|l)(O|o)(B|b)(A|a)(L|l)_"
    return t

def t_STOP(t):
    r"(S|s)(T|t)(O|o)(P|p)_"
    return t

def t_SAVE_HEADING(t):
    r"(S|s)(A|a)(V|v)(E|e)_[][!%&\(\)*+,./:<=>?@0-9A-Za-z\\\\^`{}\|~\"#$';_-]+"
    return t

def t_SAVE_END(t):
    r"(S|s)(A|a)(V|v)(E|e)_"
    return t

def t_DATA_NAME(t):
    r"_[][!%&\(\)*+,./:<=>?@0-9A-Za-z\\\\^`{}\|~\"#$';_-]+"   # _ followed by stuff
    return t

def t_DATA_HEADING(t):
    r"(D|d)(A|a)(T|t)(A|a)_[][!%&\(\)*+,./:<=>?@0-9A-Za-z\\\\^`{}\|~\"#$';_-]+"
    return t

def t_START_SC_LINE(t):
    r"(\n|\r\n);([^\n\r])*(\r\n|\r|\n)+"
    t.lexer.begin('semicolon')
    t.lexer.sctext = t.value[t.value.find(';')+1:]

def t_semicolon_SC_LINE_OF_TEXT(t):
    r"[^;\r\n]([^\r\n])*(\r\n|\r|\n)+"
    t.lexer.sctext += t.value

def t_semicolon_END_SC_LINE(t):
    r';'
    t.lexer.begin('INITIAL')
    t.value = t.lexer.sctext[:-1]   #drop eol
    if len(t.value)>0 and t.value[-1] == '\r':
        t.value = t.value[:-1]
    t.value = remove_line_folding(t.value)
    return t
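The semicolon-delimited text field handling above relies on PLY's exclusive lexer states: START_SC_LINE switches the lexer into the semicolon state, lines are accumulated on the lexer object, and the closing semicolon switches back and emits the whole field as one token. A stripped-down, self-contained sketch of that technique (the token names here are illustrative, not the ones used above):

    import ply.lex as lex

    tokens = ('WORD', 'START_TF', 'TF_LINE', 'END_TF')
    states = (('sctext', 'exclusive'),)

    t_ignore = ' \t'
    t_sctext_ignore = ''

    def t_WORD(t):
        r"[^\s;][^\s]*"
        return t

    def t_START_TF(t):
        r"(\n|\r\n);"
        t.lexer.begin('sctext')          # enter the exclusive state
        t.lexer.scbuf = ""

    def t_sctext_TF_LINE(t):
        r"[^;\r\n][^\r\n]*(\r\n|\r|\n)+|(\r\n|\r|\n)+"
        t.lexer.scbuf += t.value         # accumulate lines of the text field

    def t_sctext_END_TF(t):
        r";"
        t.lexer.begin('INITIAL')         # closing semicolon: back to normal lexing
        t.value = t.lexer.scbuf
        return t

    def t_error(t):
        t.lexer.skip(1)

    def t_sctext_error(t):
        t.lexer.skip(1)

    demo = lex.lex()
    demo.input("value1\n;line one\nline two\n; value2")
    for tok in demo:
        print((tok.type, tok.value))     # WORD, END_TF, WORD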
<Lexer 2.0>=
<Common v2 lexer code>
<CIF2.0 data value>
<Common postamble>
Commas are allowed in non-delimited data values in CIF2.0 but not in STAR2.0. Semicolons are allowed in CIF2.0 non-delimited values as long as they do not begin a line; that case is picked up by the START_SC_LINE rule, which is checked *before* the data value rule.
<CIF2.0 data value>= (<-U)
def t_DATA_VALUE_1(t):
    r"((?!(((S|s)(A|a)(V|v)(E|e)_[^\s]*)|((G|g)(L|l)(O|o)(B|b)(A|a)(L|l)_[^\s]*)|((S|s)(T|t)(O|o)(P|p)_[^\s]*)|((D|d)(A|a)(T|t)(A|a)_[^\s]*)))[^\s\"#$'_\{\}\[\]][^\s\{\}\[\]]*)"
    return t
<Lexer STAR2>=
<Common v2 lexer code>
<STAR2.0 data value>
<Common postamble>
STAR2.0 uses commas to separate list and table items, so commas are not allowed in non-delimited values.
<STAR2.0 data value>= (<-U)
def t_DATA_VALUE_1(t):
    r"((?!(((S|s)(A|a)(V|v)(E|e)_[^\s]*)|((G|g)(L|l)(O|o)(B|b)(A|a)(L|l)_[^\s]*)|((S|s)(T|t)(O|o)(P|p)_[^\s]*)|((D|d)(A|a)(T|t)(A|a)_[^\s]*)))[^\s\"#$',_\{\}\[\]][^\s,\{\}\[\]]*)"
    return t
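The difference in comma handling between the two chunks can be checked in isolation. A small sketch, using only the character classes from the two rules above (the keyword lookahead is omitted for brevity):

    import re
    cif2_value  = re.compile(r"[^\s\"#$'_\{\}\[\]][^\s\{\}\[\]]*$")
    star2_value = re.compile(r"[^\s\"#$',_\{\}\[\]][^\s,\{\}\[\]]*$")
    print(bool(cif2_value.match("1,2,3")))     # True: a comma may appear in a bare CIF2.0 value
    print(bool(star2_value.match("1,2,3")))    # False: a comma ends a bare STAR2.0 value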
The reason for switching to PLY from Yapps is that some Python builds cannot handle the wide characters allowed by our Unicode standard, and Yapps does not have any simple way to construct regular expressions conditionally.
<Common v2 lexer code>= (<-U <-U)
# A new lexer for CIF using PLY
#
import ply.lex as lex
from ply.lex import TOKEN
import re,sys
from StarFile import remove_line_folding,remove_line_prefix

# Following unicode fix based on suggestion of Pavol Juhas
# Check our Unicode status
if sys.maxunicode < 111411:
    print 'Warning: Narrow Python build detected. Unicode characters outside the Basic Multilingual Plane are not supported'
    rewidechars = ""
else:
    rewidechars = u"\U00010000-\U0010FFFD"

# Define some unicode ranges to save space - not currently used
non_blank_chars = u"[\u0021-\u007E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFD" + rewidechars + "]"
# everything that is allowed
all_chars = u"[\u0009\u000A\u000D\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFD" + rewidechars + "]"

# Construct the regular expressions accordingly
dname_regexp = "_" + non_blank_chars + "+"
save_regexp = ur"(S|s)(A|a)(V|v)(E|e)_" + non_blank_chars + "+"
dheading_regexp = ur"(D|d)(A|a)(T|t)(A|a)_" + non_blank_chars + "+"

states = (
    ('semicolon','exclusive'),
    ('tripleq','exclusive'),
    ('triplea','exclusive')
    )

tokens = (
    'COMMENT',
    'WHITESPACE',
    'LBLOCK',
    'GLOBAL',
    'STOP',
    'SAVE_HEADING',
    'SAVE_END',
    'DATA_NAME',
    'DATA_HEADING',
    'START_SC_LINE',
    'SC_LINE_OF_TEXT',
    'END_SC_LINE',
    'DAT_VAL_NOCOMMA_NOSQ',
    'DAT_VAL_INTERNAL_SQ',
    'TRIPLE_QUOTE_START',
    'TRIPLE_QUOTE_DATA_VALUE',
    'TRIPLE_APOST_START',
    'TRIPLE_APOST_DATA_VALUE',
    'LINE_OF_TEXT',
    'SINGLE_QUOTE_DATA_VALUE',
    'DATA_VALUE_1'
)

t_ignore_WHITESPACE = r"([ \t\n\r](?!;))|[ \t]"
t_ignore_COMMENT = r"(\#.*[\n\r](?!;))|(\#.*)"

literals = ['{','}','[',']',':']

def t_error(t):
    print 'Illegal character %s' % repr(t.value[0])

def t_LBLOCK(t):
    r"(L|l)(O|o)(O|o)(P|p)_"
    return t

def t_GLOBAL(t):
    r"(G|g)(L|l)(O|o)(B|b)(A|a)(L|l)_"
    return t

def t_STOP(t):
    r"(S|s)(T|t)(O|o)(P|p)_"
    return t

@TOKEN(save_regexp)
def t_SAVE_HEADING(t):
    return t

def t_SAVE_END(t):
    r"(S|s)(A|a)(V|v)(E|e)_"
    return t

@TOKEN(dname_regexp)
def t_DATA_NAME(t):
    return t

@TOKEN(dheading_regexp)
def t_DATA_HEADING(t):
    return t

def t_START_SC_LINE(t):
    r"(\n|\r\n);([^\n\r])*(\r\n|\r|\n)+"
    t.lexer.begin('semicolon')
    t.lexer.sctext = t.value[t.value.find(';')+1:]

def t_semicolon_SC_LINE_OF_TEXT(t):
    r"[^;\r\n]([^\r\n])*(\r\n|\r|\n)+"
    t.lexer.sctext += t.value

def t_semicolon_END_SC_LINE(t):
    r';'
    t.lexer.begin('INITIAL')
    t.value = t.lexer.sctext[:-1]   #drop eol
    if len(t.value)>0 and t.value[-1] == '\r':   # guard against an empty text field
        t.value = t.value[:-1]
    t.value = remove_line_prefix(t.value)
    t.value = remove_line_folding(t.value)
    return t

def t_DAT_VAL_INTERNAL_SQ(t):
    r"\[([^\s\[\]]*)\]"
    return t

def t_TRIPLE_QUOTE_START(t):
    r"\"\"\""
    t.lexer.begin('tripleq')
    t.lexer.tqval = ""

def t_tripleq_TRIPLE_QUOTE_DATA_VALUE(t):
    r"([^\r\n]*)\"\"\""
    t.lexer.begin('INITIAL')
    t.value = t.lexer.tqval + t.value[:-3]
    return t

def t_tripleq_triplea_LINE_OF_TEXT(t):
    r"([^\r\n])*(\r\n|\r|\n)+"
    t.lexer.tqval += t.value

def t_TRIPLE_APOST_START(t):
    r"'''"
    t.lexer.begin('triplea')
    t.lexer.tqval = ""

def t_triplea_TRIPLE_APOST_DATA_VALUE(t):
    r"([^\r\n]*)'''"
    t.lexer.begin('INITIAL')
    t.value = t.lexer.tqval + t.value[:-3]
    return t

def t_SINGLE_QUOTE_DATA_VALUE(t):
    r"'([^\n\r\f'])*'+|\"([^\n\r\"])*\"+"
    t.value = t.value[1:-1]
    return t
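The regular expressions built conditionally above are attached to their rule functions with PLY's @TOKEN decorator, which accepts a pre-built pattern string in place of the usual docstring. A minimal, self-contained illustration of that mechanism (the character class and names are illustrative only):

    import ply.lex as lex
    from ply.lex import TOKEN

    tokens = ('TAG',)

    # Build the pattern at import time, as the chunk above does for
    # dname_regexp, save_regexp and dheading_regexp.
    allowed = u"[A-Za-z0-9._-]"
    tag_regexp = "_" + allowed + "+"

    @TOKEN(tag_regexp)
    def t_TAG(t):
        return t

    t_ignore = ' \t\r\n'

    def t_error(t):
        t.lexer.skip(1)

    demo = lex.lex()
    demo.input("_cell.length_a _symmetry_space_group_name_H-M")
    print([tok.value for tok in demo])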