«Back

simple syntax highlighting in C

I recently implemented a syntax highlighter for C (and similar languages) in C for my website. It's just a simple lexer function that wraps keywords, identifiers, strings and numbers in <span> tags with the appropriate classes for you (me) to style. As a demo, I'll show that it works by displaying the prettify.h header; the code should speak for itself:

#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

static void print_escaped(const char c) {
    // Writes one byte to stdout in HTML-safe form:
    //   '<', '>' and '&' become their named entities,
    //   a line feed becomes a <br> tag,
    //   a carriage return is swallowed,
    //   anything else is written unchanged.
    // NOTE: multi-byte (e.g. Unicode) characters are emitted byte by
    // byte; per the author's caveat they may not come out right if the
    // tokenizer treats one of their bytes as the start of a token.
    switch (c) {
    case '<':
        fputs("&lt;", stdout);
        break;
    case '>':
        fputs("&gt;", stdout);
        break;
    case '&':
        fputs("&amp;", stdout);
        break;
    case '\n':
        fputs("<br>", stdout);
        break;
    case '\r':
        // dropped on purpose: nothing is written for CR
        break;
    default:
        putchar(c);
        break;
    }
}

static void prettify(const char *str, int (*is_keyword)(const char *id)) {
    // Writes `str` to stdout as HTML, wrapped in <pre><code>...</code></pre>,
    // with every recognised token placed inside a <span class=...>:
    //   str     - "..." and '...' literals (a backslash skips the next char)
    //   num     - a digit followed by any run of digits and dots
    //   keyword - an identifier for which is_keyword() returns non-zero
    //   id      - any other identifier (isalpha/'$'/'_' start, then
    //             isalnum/'$'/'_' characters)
    //   comment - // line comments, /* block comments */ and '#' lines
    // All other characters go through print_escaped() without a span.
    //
    // str:        NUL-terminated source text; NULL is treated as "".
    // is_keyword: predicate called with a NUL-terminated copy of each
    //             identifier; NULL means no identifier is a keyword.
    //
    // Every <span> that is opened is also closed, including for
    // unterminated strings and unterminated block comments.
    enum { ID_STACK_MAX = 64 }; // identifiers shorter than this stay on the stack

    if (str == NULL) {
        str = "";
    }

    printf("<pre><code>"); // wrap the code inside a code block
    for (size_t i = 0, len = strlen(str); i < len;) {
        const char c = str[i]; // <- character that starts a token
        // <ctype.h> functions require an unsigned char value (or EOF);
        // passing a negative plain char is undefined behaviour.
        const unsigned char uc = (unsigned char)c;

        if (c == '"' || c == '\'') { // string or character literal
            printf("<span class=str>%c", c);
            i++;
            while (i < len) {
                const char s = str[i++];
                if (s == c) { // matching quote: emit it and stop
                    fputc(c, stdout);
                    break;
                }
                if (s == '\\') {
                    // Escape sequence: the next character cannot end the
                    // literal, but it still has to be HTML-escaped. A
                    // backslash at the very end of input has no follower.
                    fputc('\\', stdout);
                    if (i < len) {
                        print_escaped(str[i++]);
                    }
                } else {
                    print_escaped(s);
                }
            }
            // Reached both after the closing quote and for an unterminated
            // literal, so the span is always closed.
            printf("</span>");
        } else if (isdigit(uc)) { // number
            // NOTE: this is not a correct way to parse floats/doubles:
            // any number of dots is accepted, so "0.." in JS's
            // 0..toString() is treated as a single number token.
            printf("<span class=num>%c", c);
            i++;
            while (i < len && (isdigit((unsigned char)str[i]) || str[i] == '.')) {
                fputc(str[i++], stdout);
            }
            printf("</span>");
        } else if (isalpha(uc) || c == '$' || c == '_') { // identifier start
            const size_t start = i++;
            while (i < len && (isalnum((unsigned char)str[i]) || str[i] == '$' || str[i] == '_')) {
                i++;
            }
            const size_t slen = i - start;

            // is_keyword() needs a NUL-terminated string. Identifiers are
            // typically tiny, so use a fixed stack buffer and only fall
            // back to the heap for unusually long ones (this avoids an
            // unbounded variable-length array on the stack).
            char small[ID_STACK_MAX];
            char *id = small;
            if (slen >= sizeof small) {
                id = malloc(slen + 1);
            }
            int kw = 0;
            if (id != NULL) {
                memcpy(id, str + start, slen);
                id[slen] = '\0';
                kw = (is_keyword != NULL) && (is_keyword(id) != 0);
            } // on allocation failure the token is emitted as a plain id

            printf("<span class=%s>", kw ? "keyword" : "id");
            // Identifier characters never need HTML escaping.
            fwrite(str + start, 1, slen, stdout);
            printf("</span>");
            if (id != small) {
                free(id);
            }
        } else if (c == '/' && str[i + 1] == '/') { // inline comment
            // str[i + 1] is at worst the NUL terminator, so the read is in bounds.
            printf("<span class=comment>//");
            i += 2;
            // The newline itself is left for the generic branch to emit.
            while (i < len && str[i] != '\n') {
                print_escaped(str[i++]);
            }
            printf("</span>");
        } else if (c == '/' && str[i + 1] == '*') { // non-inline comment
            printf("<span class=comment>/*");
            i += 2;
            while (i < len) {
                if (i + 1 < len && str[i] == '*' && str[i + 1] == '/') {
                    printf("*/");
                    i += 2;
                    break;
                }
                print_escaped(str[i++]);
            }
            // Also closes the span when the comment is never terminated.
            printf("</span>");
        } else if (c == '#') { // preprocessor lines
            printf("<span class=comment>#");
            i++;
            while (i < len && str[i] != '\n') {
                print_escaped(str[i++]);
            }
            printf("</span>");
        } else { // generic characters (like operators)
            print_escaped(str[i++]);
        }
    }
    printf("</code></pre>"); // close wrapping
}

// wrappers for prettifying code based on language:
static int prettify_c_keywords(const char *id) {
    // Specifies if the identifier we just parsed is a C keyword.
    // Returns 1 if `id` (a NUL-terminated string) exactly matches a
    // keyword, 0 otherwise. The comparison is case-sensitive.
    // Covers the C89 keywords plus those added in C99 and C11;
    // C23 additions (bool, true, nullptr, ...) are not included.
    // See: https://en.cppreference.com/w/c/keyword
    static const char *const keywords[] = {
        // C89
        "auto", "break", "case", "char", "const", "continue", "default",
        "do", "double", "else", "enum", "extern", "float", "for", "goto",
        "if", "int", "long", "register", "return", "short", "signed",
        "sizeof", "static", "struct", "switch", "typedef", "union",
        "unsigned", "void", "volatile", "while",
        // C99
        "inline", "restrict", "_Bool", "_Complex", "_Imaginary",
        // C11
        "_Alignas", "_Alignof", "_Atomic", "_Generic", "_Noreturn",
        "_Static_assert", "_Thread_local",
    };
    const size_t count = sizeof keywords / sizeof keywords[0];

    for (size_t k = 0; k < count; k++) {
        if (strcmp(id, keywords[k]) == 0) {
            return 1;
        }
    }
    return 0;
}
// Convenience wrapper: highlights `str` as C by calling prettify()
// with prettify_c_keywords as the keyword predicate.
#define prettify_c(str) prettify(str, prettify_c_keywords)

To style the whole thing in Sass, I'd do something like:

// Colors for the <span> classes emitted by the highlighter above.
code {
color: #666; // default for text outside any span (operators, punctuation)
.keyword { color: blue; }
.id { color: #111; }
.num { color: #066; }
.str { color: green; }
.comment { color: #aaa; } // also used for preprocessor (#...) lines
}

The code is in the public domain, so do whatever you want with it!

[28/2]: I have added some comments to the code, so it should be much easier for you to figure out what I'm doing.