#! /bin/sh
#
# This program reads one of James Kiefer's BIO messages and formats it
# into a web page.
#
# Brian Reid, in odds and ends of leftover time, January 1998
#
# Notes: there's still a conflict between the capitalization corrector
# and the pragmat parsers. They are getting capital-repaired before they
# are parsed. Still to do: add accented members of ISO LATIN 1 to gwords
# generator, so we can use umlauts in headings. Also, the NT/OT capitalizer
# is not working. Look for brain damage.
#
#######################################################################
# "documentation", such as it is.
#
# LineMode[n] is "quote", "verse", "preformatted", "deleted", or undefined.
#
# Algorithm is 2.5 pass. First pass builds array "LText". Then we mess with
# it. Then we write it out as html.
# Fixed search path for the programs this script runs.
# NOTE(review): the leading and trailing colons are empty PATH entries, and
# an empty entry means the current working directory. Programs found in the
# working directory therefore take precedence over the system ones. Confirm
# that this is intended before running the script in an untrusted directory.
PATH=:/usr/local/bin:/usr/local/Hughes/bin:/usr/X11R6/bin:/usr/sbin:/bin:/usr/bin:
export PATH
awk -v TOPHEAD=3 '
# Concatenate the words in "Array" into a single string, separated by
# single blanks. Elements 1 through the highest numeric subscript are
# joined in order; missing elements contribute an empty word.
# The extra parameters (idx, top, joined) are local variables.
function wordcat(Array,    idx, top, joined) {
	top = 0;
	for (idx in Array) {
		# Subscripts produced by "for (x in A)" are strings. Adding zero
		# forces a numeric comparison, so that "10" ranks above "9".
		if (idx + 0 > top) top = idx + 0;
	}
	joined = "";
	for (idx = 1; idx <= top; idx++) {
		if (idx > 1) joined = joined " ";
		joined = joined Array[idx];
	}
	return(joined);
}
# Count how many times the single character "chr" occurs in "word".
# The counter nk and the scan position m are global variables.
function NumInst(word,chr) {
	nk = 0;
	m = 1;
	while (m <= length(word)) {
		if (substr(word, m, 1) == chr) {
			nk++;
		}
		m++;
	}
	return(nk);
}
# Classify the capitalization of "word".
# Returns 0 if the word changes under toupper (it has a lowercase letter),
# 2 if it contains at least one capital letter listed in the uc table,
# and 1 otherwise (for example all digits and punctuation).
# The word is taken from the parameter, not from the global w[k].
# The extra parameter pos is a local variable.
function isAllcap(word,    pos) {
	if (word != toupper(word)) return(0);
	for (pos = 1; pos <= length(word); pos++) {
		if (substr(word, pos, 1) in uc) return(2);
	}
	return(1);
}
# Return "word" with every character listed in the puncs table removed.
# The work variables tw, gg and chr are global.
function losePunc(word) {
	tw = "";
	gg = 1;
	while (gg <= length(word)) {
		chr = substr(word, gg, 1);
		if (!puncs[chr]) {
			tw = tw chr;
		}
		gg++;
	}
	return(tw)
}
# Return 1 if "word" is probably a Roman numeral, that is, if every letter
# in it is one of the capitals in the rom table. Characters that are not
# letters are ignored, so a word without any letters also yields 1.
# Letter tallies are kept in the global array A, which is emptied first.
function isroman(word) {
	for (jw in A) delete A[jw];
	LW = length(word);
	jw = 1;
	while (jw <= LW) {
		let = substr(word, jw, 1);
		if (alpha[let]) {
			# a letter that is not a Roman digit settles the question
			if (!rom[let]) return(0);
			A[let]++;
		}
		jw++;
	}
	return(1);
}
# Capitalize a word that appears in a title.
# Roman numerals and words listed in KeepCaps (both tested with the
# punctuation removed) are returned unchanged. Words listed in lcwords
# come back in lowercase. Everything else gets one initial capital.
function titleCap(word) {
	tw = losePunc(word);
	if (isroman(tw) || KeepCaps[tw]) {
		return(word);
	}
	nword = tolower(word)
	return((lcwords[nword] != 0) ? nword : initCap(nword));
}
# Repair the capitalization of a word that was found in the running text
# in all capitals.
# Roman numerals and KeepCaps words (tested with the punctuation removed)
# are returned unchanged. A word with more than one period is returned in
# uppercase. Words listed in lcwords come back in lowercase. Everything
# else gets one initial capital.
function wordCap(word) {
	tw = losePunc(word);
	if (isroman(tw) || KeepCaps[tw]) {
		return(word);
	}
	if (NumInst(word, ".") > 1) {
		return(toupper(word));
	}
	nword = tolower(word);
	if (lcwords[nword] != 0) {
		return(nword);
	}
	icword = initCap(nword);
	return(icword);
}
# Force a word to have exactly one initial capital.
# KeepCaps words and Roman numerals are returned unchanged, and a word with
# more than one period is returned in uppercase. Otherwise any leading
# characters that are not letters are copied through, the first letter is
# capitalized, and the remainder of the word is lowercased.
function initCap(word) {
	if (KeepCaps[word]) return(word);
	if (isroman(word)) return(word);
	if (NumInst(word, ".") > 1) return(toupper(word));
	nword = tolower(word)
	# cword collects the prefix up to and including the first letter
	ji = 1;
	let = toupper(substr(word, ji, 1));
	cword = let;
	while (1) {
		if (alpha[let]) break;
		ji++;
		let = toupper(substr(word, ji, 1));
		cword = cword let;
		# stop at the end of a word that has no letters at all
		if (ji >= length(word)) break;
	}
	# append everything after that prefix in lowercase
	ji++;
	cword = cword substr(nword, ji);
	return(cword)
}
# Like index(), only right-to-left: return the position of the last
# occurrence of the single character "chr" in "str", or 0 if there is none.
# The extra parameter pos is a local variable, so the global loop
# variable i is left alone.
function rindex(str,chr,    pos) {
	for (pos = length(str); pos > 0; pos--) {
		if (substr(str, pos, 1) == chr) return(pos)
	}
	return(0);
}
# Convert the characters in a word to be shell-safe: every character that
# has a non-empty entry in the SUB table is replaced by that entry, and all
# other characters are copied unchanged.
# The extra parameters (out, pos, ch) are local variables. In particular the
# global nw, which the line-processing code uses as its word count, is no
# longer overwritten. Testing "ch in SUB" first avoids creating empty SUB
# entries for ordinary characters.
function convertWord(w,    out, pos, ch) {
	out = "";
	for (pos = 1; pos <= length(w); pos++) {
		ch = substr(w, pos, 1);
		if ((ch in SUB) && (SUB[ch] != "")) {
			out = out SUB[ch];
		} else {
			out = out ch;
		}
	}
	return(out);
}
# NOTE: the function that spots things to be hotlinked is not defined at this point; @link lines are collected by the /^@link/ rule further down.
# One-time setup of the lookup tables and state variables used by the
# functions and rules of this program.
BEGIN {
	# capitalization rules: words that stay in lowercase
	lcwords["of"]=1; lcwords["the"]=1; lcwords["in"]=1; lcwords["a"]=1;
	lcwords["an"]=1; lcwords["and"]=1;
	# month tables keyed by abbreviation: full name and month number
	MonthAbb["Jan"]="January"; MonthNum["Jan"]=1;
	MonthAbb["Feb"]="February"; MonthNum["Feb"]=2;
	MonthAbb["Mar"]="March"; MonthNum["Mar"]=3;
	MonthAbb["Apr"]="April"; MonthNum["Apr"]=4;
	MonthAbb["May"]="May"; MonthNum["May"]=5;
	MonthAbb["Jun"]="June"; MonthNum["Jun"]=6;
	MonthAbb["Jul"]="July"; MonthNum["Jul"]=7;
	MonthAbb["Aug"]="August"; MonthNum["Aug"]=8;
	MonthAbb["Sep"]="September"; MonthNum["Sep"]=9;
	MonthAbb["Oct"]="October"; MonthNum["Oct"]=10;
	MonthAbb["Nov"]="November"; MonthNum["Nov"]=11;
	MonthAbb["Dec"]="December"; MonthNum["Dec"]=12;
	# reverse tables keyed by full month name
	for (j in MonthAbb) {
		MonthNumber[MonthAbb[j]] = MonthNum[j];
		MonthCode[MonthAbb[j]] = j;
	}
	# "Sept" is added after the loop above, so it is accepted as an
	# abbreviation but MonthCode["September"] is always "Sep".
	MonthAbb["Sept"]="September"; MonthNum["Sept"]=9;
	# words that keep their capitals
	KeepCaps["ELCA"]=1; KeepCaps["ECUSA"]=1;
	KeepCaps["NT"]=1; KeepCaps["OT"]=1; KeepCaps["UTO"]=1;
	KeepCaps["AD"]=1; KeepCaps["ISBN"]=1;
	CapWords["Christian"]=1; CapWords["Christ"]=1; CapWords["Jesus"]=1;
	# character-class rules
	# lc: lowercase letters, uc: capital letters, alpha: both
	alph="abcdefghijklmnopqrstuvwxyz"
	for (i=1; i<=length(alph); i++) {
		lc[substr(alph,i,1)]=1; alpha[substr(alph,i,1)]=1;
	}
	uca=toupper(alph);
	for (i=1; i<=length(uca); i++) {
		uc[substr(uca,i,1)]=1; alpha[substr(uca,i,1)]=1;
	}
	# rom: the capital letters used in Roman numerals
	roman="IVXLCDM";
	for (i=1; i<=length(roman); i++) {rom[substr(roman,i,1)]=1;}
	# ipuncs: opening punctuation that ends an italicized sequence
	ipunc="(;[{";
	for (i=1; i<=length(ipunc); i++) {ipuncs[substr(ipunc,i,1)]=1;}
	# puncs: every punctuation character removed by losePunc
	allpunc="!@#$%^&*()_+`~-={}[]:\";'\''<>,./?;|\\";
	for (i=1; i<=length(allpunc);i++) {puncs[substr(allpunc,i,1)]=1;}
	# character translation rules
	SUB["?"]="@Q"; SUB["-"]="@-";
	SUB["!"]="@E"; SUB["'\''"]="@P";
	SUB["/"]="@S"; SUB["\""]="@D";
	SUB["$"]="@M"; SUB["%"]="@C"; SUB["*"]="@A";
	SUB["<"]="@1"; SUB[">"]="@2"; SUB["|"]="@V";
	SUB["{"]="@3"; SUB["}"]="@4"; SUB["~"]="@T";
	SUB["["]="@5"; SUB["]"]="@6"; SUB["`"]="@F";
	SUB["("]="@7"; SUB[")"]="@8"; SUB["#"]="@Z";
	SUB[":"]="@K"; SUB[";"]="@L"; SUB[","]="@,";
	SUB["\\"]="@B";SUB["@"]="@@"; SUB["&"]="@N";
	# ordinary initialization
	Nln=0;
	InHeader = 1;
	# pretend there is a blank line before the first stored line
	BlankLine[0] = 1;
	FirstText = 0;
	SeenHeading = 0;
	Itemizing = 0;
	# NOTE(review): BLANKS holds a single blank here. If it is meant to be
	# a run of padding blanks, confirm the intended width.
	BLANKS=" ";
	LinkCount=0;
}
################# beginning of main body of AWK program #####
# Collect one cross-reference definition of the form
#     @link|phrase|target
# For each accepted definition this records:
#   Lcode[phrase]       a placeholder code of the form ${n}
#   Link[code]          the hyperlink text that replaces the placeholder
#   LinkOrder[n]        the phrase, in order of definition
# Lines without exactly three fields, and links whose target equals the
# THISFILE environment variable, are dropped with "next".
/^@link/ {
	NLF=split($0,linkfields,"|");
	if (NLF != 3) next;
	if (linkfields[3]==ENVIRON["THISFILE"]) next;
	# Count only accepted links, so that LinkOrder has no empty slots.
	LinkCount++;
	LinkCode=sprintf("${%d}",LinkCount);
	# NOTE(review): this copy of the file had a bare "%s" format here, which
	# used only the target and ignored the phrase argument. The anchor markup
	# is reconstructed from the two arguments. Confirm against the original.
	LinkResult=sprintf("<a href=\"%s\">%s</a>", linkfields[3], linkfields[2]);
	Lcode[linkfields[2]]=LinkCode;
	Link[LinkCode]=LinkResult;
	LinkOrder[LinkCount] = linkfields[2];
}
# General rule, run for every input line that reaches it: store the line in
# LText[] and keep the per-line bookkeeping arrays (BlankLine, Itemized,
# LeftMargin) and the FirstText marker up to date.
{
# For input that is not standard input, skip blank lines and # comments.
if (FILENAME != "-") {
if (NF == 0) next;
# Allow # as comment character except in bios themselves
if (substr($0,1,1)=="#") next;
}
if (NF == 0) {
# Blank line
# A blank line ends the header. Only one blank entry is stored for a
# run of consecutive blank lines.
InHeader = 0;
if (BlankLine[Nln]) next;
BlankLine[++Nln]=1;
LText[Nln]="";
} else {
# Nonblank line.
# A line that consists only of a ">" quote marker is stored as blank.
# NOTE(review): two of the four literals below are duplicates of the
# other two. They presumably differed in the number of blanks
# originally. Confirm against the original file.
if (($0 == " > ") || ($0 == " >") ||
($0 == " >") || ($0 == " > ")) {
BlankLine[++Nln]=1;
LText[Nln]="";
} else {
# Nln has not been advanced yet here, so this assignment is made at the
# index of the previously stored line.
Itemized[Nln]=Itemizing;
# Nonblank lines are discarded while still in the header.
if (InHeader) next;
LText[++Nln] = $0;
# Replace a leading ">" quote marker by a blank.
# NOTE(review): as written, the second and third tests compare a 4- or
# 5-character prefix with a 3-character literal. That can only be equal
# for the line " > " itself, which the branch above has already taken,
# so these two tests never succeed here. The literals presumably
# contained more blanks originally. Confirm against the original file.
if (substr($0,1,3)==" > ") LText[Nln] = " " substr($0,4);
if (substr($0,1,4)==" > ") LText[Nln] = " " substr($0,5);
if (substr($0,1,5)==" > ") LText[Nln] = " " substr($0,6);
# Keep track of the running left margin of the text
# After the loop, i is the position of the first character that is not
# a blank.
for (i=1; i<=length(LText[Nln]); i++) {
if (substr(LText[Nln],i,1) != " ") break;
}
LeftMargin[Nln] = i;
# i is at least 1 here, so the leading blanks are always removed.
if (i > 0) LText[Nln]=substr(LText[Nln],i);
# Remember the first line with a left margin of at most 3 that is
# stored after a heading has been seen.
if ((i <= 3) && (SeenHeading!= 0) && (FirstText == 0)) {
FirstText = Nln;
}
}
}
}
##### begin processing of lines beginning with =
# Lines beginning with an = are major title
# Processing of major title: break up into words, repair capitalization,
# reassemble into string, remove date from end of string, re-break into words,
# make calls to gif-word generator using those words.
/^=/ {
	# The general rule above has already stored this line in LText[Nln]
	# with its leading blanks removed. Drop the "=" marker.
	ln=substr(LText[Nln],2)
	SeenHeading = 1;
	# break it up into words
	nf=split(ln,f,FS);
	# Empty the Title array. The dummy assignment makes sure that Title is
	# an array before it is scanned.
	Title[0]="";
	for (j in Title) {delete Title[j]};
	# fix capitalization; the first word always gets an initial capital
	Title[1]=initCap(f[1]);
	for (i=2; i<=nf; i++) {
		Title[i]=titleCap(f[i]);
	}
	# reassemble into a string
	TitleString=wordcat(Title);
	# the last parenthesized item in the title is its date
	d1=rindex(TitleString, "(");
	if (d1) {
		d2=rindex(TitleString,")");
		# Without a closing parenthesis after the "(", the date runs to the
		# end of the title. Point d2 just past the last character so that
		# the final character of the date is kept.
		if (d2 < d1) {d2 = length(TitleString) + 1;}
		PT = substr(TitleString,1,d1-1);
		PDate = substr(TitleString,d1+1,d2-d1-1);
		# remove the date from the end of the title string, then trim the
		# blanks that preceded it
		TitleString = PT;
		while (substr(TitleString,length(TitleString),1)==" ") {
			TitleString=substr(TitleString,1,length(TitleString)-1);
		}
		# split the date into day, month, and year. Use "NT" if year is
		# missing. Only the first dated title sets Day, Month and Year.
		if (Day == "") {
			# The part count goes into NDC rather than the built-in NF.
			# Assigning to NF would rebuild $0 for the rules that follow.
			NDC=split(PDate,DC," ");
			Day=DC[1];
			Month=initCap(DC[2]);
			if (MonthAbb[Month] != "") Month = MonthAbb[Month];
			if (NDC > 2) { Year=DC[3];} else {Year="NT"};
			if (Year=="Nt") Year="NT";
			if (Year=="Ot") Year="OT";
			if (Day != "") NeedImage[convertWord(Day)]=Day;
			if (Month != "") NeedImage[convertWord(Month)]=Month;
			if (Year != "") NeedImage[convertWord(Year)]=Year;
		}
	}
	# The first title on the page becomes the page title.
	if (PageTitle == "") PageTitle = TitleString;
	# Redivide the dateless title back into words
	NTW=split(TitleString,Words,FS);
	# Find the sort key for the title: the first word that contains a comma
	# (minus its last character), or else the first word.
	if (SortKey == "") {
		SortKey = Words[1];
		for (g=1; g<=NTW; g++) {
			if (index(Words[g],",")) {
				SortKey=substr(Words[g],1,length(Words[g])-1);
				break;
			}
		}
	}
	BioTitles[TitleString] = SortKey;
	SortKey = "";
	# Make a title display out of generated gif images of each word
	# NOTE(review): the string literals from here to the end of this rule
	# contain no markup, the sprintf format has no conversions for its two
	# arguments, and two literals are split across lines. The original HTML
	# appears to have been lost from this copy of the file. These lines are
	# reproduced unchanged and must be restored from the original.
	LText[Nln]="\n";
	for (k=1; k<=NTW; k++) {
		TitleWord=convertWord(Title[k]);
		NeedImage[TitleWord]=Title[k];
		Wtemp=sprintf("\n",Title[k],TitleWord);
		LText[Nln]=LText[Nln] Wtemp;
	}
LText[Nln]=LText[Nln] "
\n";
	# Leave a record that we have been here.
	LineProc[Nln] = 1;
	Break[Nln] = 1;
MarkupBefore[Nln] = MarkupBefore[Nln] "
";
}
##### end processing of lines beginning with =
/^PRAYER \(/ {
LText[Nln]=sprintf("
"; } } else { if (inQuote) { EnvEnd[i-1] = "blockquote"; Break[i] = 1; BlockEnd[blk] = i-1; } inQuote = 0 CRbreak = 0; } } if (inQuote) { EnvEnd[Nln] = "blockquote"; BlockEnd[blk] = Nln-1; } # # Sweep 1a: look over the indented blocks to see if any of them is # preformatted. # for (i=1; i<=blk; i++) { if (BlockIdentified[i]==0) { i1 = BlockStart[i]; i2 = BlockEnd[i]; # Look at each line in the indented block to see if it contains any # manual formatting, spacebar stuff. for (j=i1; j<=i2; j++) { line=LText[j]; for (k=LeftMargin[j];k<=length(line); k++) { if (substr(line,k,1)==" ") { nsp++; if (nsp > 2 || (((nsp > 1) && (LeftMargin[j]>0))) ) { BlockIdentified[i] = 2; EnvBegin[i1]="pre"; EnvEnd[i2]="pre"; break; } } else {nsp=0}; } if (BlockIdentified[i] != 0) break; } # If this is a
block, set the "line has been processed" flag on every # line in the block, to make sure that we dont mess with it further. if (BlockIdentified[i] == 2) { for (j=i1; j<=i2; j++) { LineProc[j]=1; LineMode[j]="preformatted"; } } } } # # Sweep 1b: look over the indented blocks to see if any of them is verse. If # the block is verse, were going to need to keep track of the left margin # so we can put in non-breaking spaces when the line is emitted. # for (i=1; i<=blk; i++) { if (BlockIdentified[i]) continue; Leftmost[i] = 999; IsVerse = 0; ShortLines = 0; VeryShortLines = 0; Jaglines = 0; if (BlockIdentified[i]==0) { i1 = BlockStart[i]; i2 = BlockEnd[i]; nlines = i2-i1+1; # Look at each line in the indented block to see if it has the margins that # make it look like poetry. And find the mimimum left margin. for (j=i1; j<=i2; j++) { if (LeftMargin[j] != "" && LeftMargin[j] < Leftmost[i]) { Leftmost[i] = LeftMargin[j]; } line=LText[j]; llen=length(line); if ((llen > 0) && (llen < 50)) ShortLines++; if ((llen > 0) && (llen < 30)) VeryShortLines++; if ((llen != 0) && (length(LText[j-1]) != 0)) { if ((j > i1) && (LeftMargin[j] != LeftMargin[j-1])) Jaglines++; } } if ((ShortLines > (nlines/4)) || (Jaglines > nlines/5)) { IsVerse = 1; } if (VeryShortLines > 1) IsVerse = 1; } if (IsVerse == 1) { BlockIdentified[i] = 3; for (j=i1; j<=i2; j++) { LineMode[j]="verse"; MinMargin[j]=Leftmost[i]; } } } # for (i=1; i<=Nln; i++) print i,LeftMargin[i],LineMode[i],substr(LText[i],1,25) # # Sweep 2: look for lines that need transformations applied # for (i=1; i<=Nln; i++) { if (LineProc[i]) continue; # Look for horizontal rules Naster = 0; for (j=1; j<=length(LText[i]); j++) { chr = substr(LText[i],j,1); if ((chr == "=") || (chr == "*") ) {Naster++;} } if (Naster > 2 && (100*Naster/length(LText[i])) > 50) { # Its a rule if (!BlankLine[i-1] || Break[i-1]) { MarkupBefore[i] = MarkupBefore[i] "
"; } LText[i] = "
"; Break[i]=1; continue; # skip the rest of Sweep 2 } # Deal with capitalization. Find each word in the line and call the # "scrutinize capitalization" procedure on it. Then reassemble the line. nw=split(LText[i],w,FS); if (nw > 0) { # find sequences of capitalized words. Probably turn them into sequences of # italicized words. This is deeply twisted logic, because it is all rather # empirical. We want things like JUSTIN MARTYR, HIS LIFE AND THOUGHT # to be italicized, but we dont want J C MARTYR to be italicized, and # we dont want AD 308 to be italicized. for (k=1; k<=nw; k++) iscap[k] = isAllcap(w[k]); capping = 0; for (k=1; k<=nw; k++) { if (iscap[k]) { if (capping) w[k]=wordCap(w[k]); else w[k]=initCap(w[k]); # Do not italicize initials, which means 2 or more single-letters in a row if (!capping && (length(w[k])==1 && length(w[k+1])==1)) continue; # do not italicize short capitalized sequences that are mostly numbers if (!capping && (iscap[k]==1) && ( (k>=nw-1) || (iscap[k+2] != 2))) continue; if ((k < nw) && iscap[k+1] && !iscap[k-1]) { w[k]="" w[k]; capping = 1; } # at the end of an italicized sequence, put in the exit-italics code. if (capping && !iscap[k+1]) { w[k]=w[k] ""; capping = 0 } # a word that begins with one of the initial punctuations characters # will end italicization. This only works some of the time. if (capping && ipuncs[substr(w[k+1],1,1)]) { w[k]=w[k] ""; capping = 0 } } } turn it off at the end of each line if (capping) {capping=0; w[nw]=w[nw] ""}; ################ end of twisted italicization logic # Look here for things like URLs and email addresses for (k=1; k<=nw; k++) { wordflag[k] = 0; if (tolower(substr(w[k],1,7))=="http://") { w[k]="" w[k] "" wordflag[k] = 1; } if (index(w[k],"@")) { ati=index(w[k],"."); right=substr(w[k],ati+1); dot=index(right,"."); if (dot > 0) { w[k] = "" w[k] "" wordflag[k] = 2; } } } # Do init-cap processing of the line unless the line contains markup. 
if (((wordflag[1]==0 && (Break[i-1]) || (Break[i] || BlankLine[i-1])) && (substr(w[1],1,1) != "<"))) { w[1]=initCap(w[1]); } LText[i]=w[1]; for (k=2; k<=nw; k++) { LText[i] = LText[i] " " w[k] } } # Look out for poem titles. A poem title is a line that has a break before # and after it, and for which the next substantive line is marked "verse". if ( (Break[i] || BlankLine[i-1]) && (!BlankLine[i]) && (LeftMargin[i] <= 1) && ((LineMode[i+1]=="verse") || (LineMode[i+1]=="quote") || (BlankLine[i+1] && ((LineMode[i+2]=="verse") || LineMode[i+2]=="quote")))) { S1=sprintf("",TOPHEAD+1); S2=sprintf(" ",TOPHEAD+1); MarkupBefore[i] = MarkupBefore[i] S1; MarkupAfter[i] = S2 MarkupAfter[i]; LineProc[i] = 4; } } # # Now take all "text" lines and remove the line breaks, so that we # can look for cross-linked phrases line by line. # for (i=1; i<=Nln; i++) { if (LineProc[i]) continue; if (LineMode[i]!= "") continue; if (BlankLine[i]) continue; for (j=1; j<=Nln-1; j++) { if (LineMode[i+j]=="" && (!BlankLine[i+j]) && (LText[i+j]!="")) { LText[i] = LText[i] " " LText[i+j]; LineMode[i+j]="deleted"; BlankLine[i]=0; LText[i+j]= ""; } else { break; } } } # # Process the cross-reference links # for (i=1; i<=Nln; i++) { if (LineProc[i]) continue; if (LineMode[i] == "deleted") continue; if (LineMode[i] == "verbatim") continue; for (j=1; j<=LinkCount; j++) { target=LinkOrder[j]; LinkMatch[j]=gsub(target, Lcode[target], LText[i]); } for (j=1; j<=LinkCount; j++) { target=LinkOrder[j]; if (LinkMatch[j] > 0) { gsub(Lcode[target], Link[Lcode[target]], LText[i]); } } } print "" print "" print "" print " " print " " print " " printf "%s \n",PageTitle; # Dump the table of needed images as comments in the header section. 
for (j in NeedImage) { printf " \n",NeedImage[j],j; } if ((Day != "") && (Month != "")) { MC = MonthCode[Month]; SortKey = sprintf("%02d%02d",MonthNumber[Month],Day); if (Year == "") { printf " \n",SortKey,Day,MC } else { printf " \n",Year,SortKey,Day,MC,Year } } for (i in BioTitles) { printf " \n",BioTitles[i],i } print "" print "" print "" # Now dump the text buffers. for (i=1; i<=Nln; i++) { if (MarkupBefore[i] != "") printf "%s",MarkupBefore[i]; if (EnvBegin[i] != "") printf "<%s>\n",EnvBegin[i]; if (LineMode[i]=="deleted") continue; if (BlankLine[i] && !Itemized[i]) { if (!Break[i-1] && !Break[i+1]) print "" } else { if (i == FirstText && (Day != "")) { print ""; printf("\n",Day,convertWord(Day)); printf("\n",Month,convertWord(Month)); printf("\n",Year,convertWord(Year)); print "
"; } if (Itemized[i] && (Break[i-1] || BlankLine[i-1])) printf "