Recently I made up my mind about how to quickly scan a (source) string for keywords, i.e. like when writing a compiler (scanner).
Now, I am not very versed in school-book compiler techniques, so am doing it 'my way'. I have already created a compiler before (written in freebasic btw) that compiles a source in to C; and it did so pretty fast, compiling about a million lines of source code per second.
I used hashes and binary search method to identify keywords. But I think that my new technique would be faster.
I am courious what anybody else thinks about this.
However, I am not sure if a similiar technique already exists and I am only re-inventing the wheel here :D — likely it does since it is basically a form of tree traversal.
The technique explained:
I want to 'eliminate' the search and rather 'instantly' ('on the fly') find if an identifier is a keyword. Keywords consist of characters, the idea is that those characters form a path — indexes into a hierarchy of arrays (alphabets), that I populate beforehand in the appropriate way. Note: Some keywords may be subsets of other keywords, i.e. "for" occurs in "foreach"; in order to work one has two options and must strictly adhere to one of them when populating the structure: In such a case the precedence when populating is either from the shortes to the longest keyword or vice versa. I have chosen longest before shortes.
So, when I scan a source character by character, I simply traverse the hierarchy of arrays, and, if the traversal does not meet a dead end or does not end up with a wrong length, it is a keyword.
The 'length' parameter is necessary to resolve disambiguities, i.e. "for" and "foreach" would traverse along the same path but not to the same level.
Also the technique trades memory for speed and the 'garbage collector' is a little tricky too — the allocated memory has to be freed recursively.
Of course this is not a problem in environments with an automatic garbage collection :D
So far I have implemented the code in C (and Java) and am having trouble translating it into OB; that is, I do not want to produce memory holes with trying this and that, as the pointers-thing may work differently.
If someone could assist with it I would appreciate (see second code below, my version so far — not tested).
C code:
// scan.c
#include <stdio.h>
#include <stdlib.h>
#define TRUE -1
#define FALSE 0
struct N
{
struct N* n;
int l;
};
void populate(void); //init structure
int match(char*);
void gc(struct N*); //garbage collect
struct N* h;
struct N* a;
char* keywords[] = {"fi", "foreach", "for"};
int main()
{
h = (struct N*)calloc(26, sizeof(struct N));
char* keyword = "foreach";
populate();
if (match(keyword)) printf("match: \"%s\"\n", keyword); else printf("no match: \"%s\"\n", keyword);
gc(h);
free(h);
return(0);
}
void populate()
{
int k, l;
for (k = 0; k < sizeof(keywords) / sizeof(char*); k++)
{
char* c;
l = -1;
for (c = keywords[k]; *c; c++) l++; //strlen
a = h;
for (c = keywords[k]; *c; c++)
{
a += *c - 'a';
if (a->n == NULL) a->n = (struct N*)calloc(26, sizeof(struct N));
a->l = l--;
a = a->n;
}
}
}
int match(char* kword)
{
int l = -1;
a = h;
char* c;
for (c = kword; *c; c++)
{
if (*c < 'a' || *c > 'z') return(FALSE);
a += *c - 'a';
if (a->n == NULL) return(FALSE);
l = a->l;
a = a->n;
}
if (l != 0) return(FALSE);
return(TRUE);
}
void gc(struct N* n)
{
int i;
for (i = 0; i < 'z' - 'a'; i++)
{
if (n->n != NULL)
{
gc(n->n);
free(n->n);
}
++n;
}
}
OB code:
// scan.bas
#case sensitive
#define TRUE -1
#define FALSE 0
#define ASCA 97
#define ASCZ 122
#lookahead
#indexbase 0
type N
N* n
int l
end type
N* h
N* a
string keywords[2] => ("end if", "foreach", "for")
function main as integer
@h = getmemory 26 * sizeof N
string keyword = "foreach"
populate
if match keyword
print "match: " keyword
else
print "no match: " keyword
end if
gc @h
freememory @h
end function
main
sub populate
for k = 1 to spanof keywords
l = len keywords[k] - 1
@a = @h
byte* c
@c = *keywords[k]
while c
&a += c - ASCA
if @a.n == 0 then @a.n = getmemory 26 * sizeof N
a.l = l--
&a = &a.n
@c++
wend
next
end sub
function match(string kword) as integer
@a = @h
byte* c
@c = *kword
while c
if c < ASCA || c > ASCZ then return FALSE
@a += c - ASCA
if @a.n == 0 then return FALSE
l = a.l
@a = @a.n
@c++
wend
if l != 0 then return FALSE
return TRUE
end function
sub gc(N* n)
for i = 0 to ASCZ - ASCA
if @n.n != 0
gc @n.n
freememory @n.n
end if
@n++
next
end sub