Project

General

Profile

Bug #1348 » teragram.c

source file to C extension - nwiger (Nate Wiger), 04/02/2009 10:53 AM

 
/*
* Adaptation of Teragram dictionaries to Ruby
*/
#include <ruby.h>
#include <vulgarityfilter.h> // Class: Dictionaries
#include "teragram.h"

/*
 * Debug tracing switch.  Uncomment the define below to compile tracing in:
 * teragram_printf() then forwards its arguments to printf(); otherwise it
 * expands to nothing and its arguments are never evaluated.
 */
//#define _VERBOSE_DEBUG
#ifdef _VERBOSE_DEBUG
# include <stdio.h>
# define teragram_printf(...) printf(__VA_ARGS__)
#else
# define teragram_printf(...)
#endif

/*
 * Single file-wide dictionary state.  teragram_initialize() loads into it and
 * the filter/detection functions read from it, so every Teragram object in
 * the process shares the same dictionaries.
 */
Dictionaries gstDicts;

/*
 * Handles for the Ruby class Teragram and its nested exception class
 * Teragram::Exception; both are assigned in Init_teragram().
 */
VALUE rb_cTeragram, rb_eTeragramException;

/*
 * Teragram#filter(string) -> String
 *
 * Runs the full-word vulgarity filter and then the substring filter over
 * `string` and returns the masked text as a NEW Ruby string; the argument
 * itself is left untouched.
 *
 * self   - receiver (unused; the dictionaries live in the global gstDicts).
 * string - text to filter.  StringValuePtr() raises TypeError if it cannot be
 *          converted to a String.
 *
 * Raises Teragram::Exception when a work buffer cannot be allocated or when
 * either filter pass returns something other than FILTER_SUCCESS.  All work
 * buffers are released before the exception is raised, because rb_raise()
 * does not return to its caller.
 */
VALUE teragram_filter(VALUE self, VALUE string)
{
    teragram_printf( "%s\n", __FUNCTION__ );

    VALUE RetVal = Qnil;
    const char* error = NULL;   /* set on failure; raised after cleanup */
    size_t iStrLen = 0;
    int ret = 0, iVulgarCounter = 0;
    char* acTmpMsg = NULL;      /* output of the full-word pass */
    char* acOutput = NULL;      /* output of the substring pass */
    char* input_text = StringValuePtr(string);

    if( !input_text )
        return Qnil; /* Give me nothing, and I'll give you nothing in return. */

#if defined _VERBOSE_DEBUG
    fprintf(stderr, "DEBUG: word = '%s'\n", input_text);
#endif

    /* strlen() stops at the first NUL byte, so anything after an embedded
     * NUL in the Ruby string is not filtered and not returned. */
    iStrLen = strlen(input_text);

    /* Temporary buffer for the full-word pass.
     * NOTE(review): both buffers assume the filter never writes more than
     * strlen(input)+1 bytes -- confirm against vulgarityfilter.h. */
    acTmpMsg = malloc(iStrLen + 1);
    if( !acTmpMsg )
    {
        error = "Vulgarity filter is out of memory";
        goto cleanup;
    }

    /* Execute the SCE-RT vulgarity filter on the input text. */
    ret = iMaskVulgarWords(
        &gstDicts,
        input_text,
        acTmpMsg,
        &iVulgarCounter
    );
    if( ret != FILTER_SUCCESS )
    {
        /* If the text cannot be guaranteed clean, none of it is returned. */
        error = "Failed to perform full-word filtering on input text";
        goto cleanup;
    }

    /* Perform a substring filter on the already word-filtered content. */
    acOutput = malloc(iStrLen + 1);
    if( !acOutput )
    {
        error = "Vulgarity filter is out of memory";
        goto cleanup;
    }

    ret = iMaskSubstringMatches(
        gstDicts.fpat,
        1,
        acTmpMsg,
        acOutput,
        &iVulgarCounter
    );
    if( ret != FILTER_SUCCESS )
    {
        error = "Failed to perform substring filtering on input text";
        goto cleanup;
    }

    /* Convert the C string into a Ruby String (copies the bytes). */
    RetVal = rb_str_new2(acOutput);

cleanup:
    /* free(NULL) is a no-op, so this is safe on every path. */
    free(acOutput);
    free(acTmpMsg);

    if( error )
        rb_raise(rb_eTeragramException, "%s", error);

    return RetVal;
}


/*
 * Teragram#dictdir -> the object stored by initialize
 *
 * This function is registered with rb_define_method(), so although the
 * prototype spells the parameter as teragram_t*, the value actually passed
 * in is the receiver that Data_Get_Struct() unwraps.  It is converted to
 * VALUE explicitly before unwrapping.
 *
 * Returns the `dictdir` field of the wrapped struct, or nil when called with
 * a null receiver (checked BEFORE the receiver is unwrapped).
 */
VALUE teragram_dictdir(teragram_t* self)
{
    VALUE receiver = (VALUE)self;
    teragram_t* teragram = NULL;

    teragram_printf( "%s(self = %p)\n", __FUNCTION__, (void*)self );

    if( !self )
        return Qnil;

    Data_Get_Struct(receiver, teragram_t, teragram);

    /* dictdir is nil until initialize has run; do not coerce nil to a string */
    teragram_printf( "teragram->dictdir = '%s'\n",
                     NIL_P(teragram->dictdir) ? "(nil)" : StringValuePtr(teragram->dictdir) );

    return teragram->dictdir;
}


/*
 * Mark callback handed to Data_Wrap_Struct(): marks the Ruby object held in
 * the struct's `dictdir` field.
 */
void teragram_mark(teragram_t* self)
{
    VALUE held = self->dictdir;

    rb_gc_mark(held);
}


/*
 * Free callback handed to Data_Wrap_Struct(): releases the struct that
 * teragram_allocate() obtained from malloc().
 */
void teragram_free(teragram_t* self)
{
    void* block = self;

    free(block);
}


/*
 * Allocator for Teragram objects (installed with rb_define_alloc_func()).
 *
 * klass - class the new object is wrapped as.
 *
 * Returns a new wrapped teragram_t whose `dictdir` is nil.  Raises
 * NoMemoryError (via rb_memerror()) when malloc() fails instead of writing
 * through a null pointer.
 */
VALUE teragram_allocate(VALUE klass)
{
    teragram_t *t = malloc(sizeof *t);

    if( !t )
        rb_memerror();  /* does not return */

    /* Must be a valid VALUE before the GC can call teragram_mark(). */
    t->dictdir = Qnil;
    return Data_Wrap_Struct(klass, teragram_mark, teragram_free, t);
}


VALUE teragram_is_vulgar(VALUE klass, VALUE string)
{
int iStrLen = 0, ret = 0;
char* input_text = StringValuePtr(string);

if( input_text && strlen(input_text) )
{
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
// Execute the SCE-RT vulgarity detection on the input text.
ret = iDetermineVulgarity(&gstDicts, input_text);
if( (ret == FILTER_FAILED) || (ret == FILTER_VULGAR) )
return Qtrue;

ret = iDetermineSubStringMatch(gstDicts.fpat, input_text);
if( (ret == FILTER_FAILED) || (ret == FILTER_VULGAR) )
return Qtrue;
}

return Qfalse;
}


/*
 * Teragram#initialize(dictdir)
 *
 * self    - the object being initialized.
 * dictdir - dictionary directory; stored on the object and passed to
 *           load_vulgarity_dictionaries(), which fills the global gstDicts.
 *
 * Raises ArgumentError when dictdir does not respond to to_s.
 * Returns self.
 *
 * NOTE(review): the status returned by load_vulgarity_dictionaries() is only
 * traced in debug builds and never acted upon, so a failed load goes
 * unnoticed here -- confirm the success code and consider raising.
 */
VALUE teragram_initialize(VALUE self, VALUE dictdir)
{
    teragram_t* wrapped;
    int load_status;

    teragram_printf( "%s( 0x%x, '%s' )\n", __FUNCTION__, self, StringValuePtr(dictdir) );

    if( !rb_respond_to(dictdir, rb_intern("to_s")) )
    {
        rb_raise(rb_eArgError, "dictdir must be a string that responds to to_s");
    }

    /* Remember the directory on the object so #dictdir can return it. */
    Data_Get_Struct(self, teragram_t, wrapped);
    wrapped->dictdir = dictdir;

    load_status = load_vulgarity_dictionaries(StringValuePtr(dictdir), &gstDicts);
    teragram_printf( "load_vulgarity_dictionaries returned %d\n", load_status );
    (void)load_status; /* only consumed by the debug trace above */

    return self;
}


/*
 * Extension entry point: defines the Teragram class, its nested
 * Teragram::Exception class (a StandardError), the allocator, and the
 * instance methods initialize, dictdir, filter and is_vulgar?.
 */
void Init_teragram()
{
    teragram_printf( "%s\n", __FUNCTION__ );

    /* Class hierarchy. */
    rb_cTeragram = rb_define_class( "Teragram", rb_cObject );
    rb_eTeragramException = rb_define_class_under( rb_cTeragram, "Exception", rb_eStandardError );

    /* Object allocation. */
    rb_define_alloc_func(rb_cTeragram, teragram_allocate);

    /* Instance methods (name, C implementation, argument count). */
    rb_define_method(rb_cTeragram, "initialize", teragram_initialize, 1);
    rb_define_method(rb_cTeragram, "dictdir",    teragram_dictdir,    0);
    rb_define_method(rb_cTeragram, "filter",     teragram_filter,     1);
    rb_define_method(rb_cTeragram, "is_vulgar?", teragram_is_vulgar,  1);
}

(1-1/2)