#!/usr/bin/perl
# WordMake 0.2: Dictionary Maker based on text files input
# Takes raw text files as input and parses them into dictionary files
# Features:
#   Select as many raw texts as you like
#   Choose words between a specified length
#   Choose an output file
#   Sorts all the words, makes them lowercase and unique
#
# That's all I was looking for in order
# to make a better Spanish dictionary.
# If you find it useful, drop me a line and, if
# you want, send me your dictionary.
# ToDo: Better cmd line parsing, more regular expression filters, etc.
# Of course if you make it better, let me know.
# Author: Linga
# Date (Start of the Project): 25-05-00
# Last Modified: 27-05-00
# BugFixes:
#   From 0.1: max and min command line parameters were ignored because of
#   a typo in the variable names.
# Changes:
#   From 0.1: Now also accepts a whole directory where all the text files
#   to be processed are.
# Usage: ./wordmake.pl TEXT_FILE_1 TEXT_FILE_2 ... TEXT_FILE_N [-o OUTPUT_FILE]
#        [-min MIN_WORD_LENGTH] [-max MAX_WORD_LENGTH] [-txtdir DIR_WHERE_TXTs_ARE]

use strict;
use warnings;

# Defaults... may be changed on the command line.
my $min_long = 4;
my $max_long = 14;
my $fout     = "dict_clean.txt";
my $txt_dir;     # stays undef unless -txtdir is given
my @rawfiles;    # input text files, in the order they will be read

my $usage = "Usage: $0 TEXT_FILE_1 TEXT_FILE_2 ... TEXT_FILE_N [-o OUTPUT_FILE] "
          . "[-min MIN_WORD_LENGTH] [-max MAX_WORD_LENGTH] "
          . "[-txtdir DIR_WHERE_TXTs_ARE]\n";

# Names that must never be treated as input: the directory self/parent
# entries and this script itself.
my %skip_name = map { $_ => 1 } ('.', '..', 'wordmake0.2.pl');

# Each option consumes the following argument and stores it in the
# variable it points to; anything else is an input file name.
my %option_target = (
    '-o'      => \$fout,
    '-min'    => \$min_long,
    '-max'    => \$max_long,
    '-txtdir' => \$txt_dir,
);

# Walk @ARGV by index rather than by truthiness, so that an argument such
# as "0" does not end the parsing early.
my $i = 0;
while ($i < @ARGV) {
    my $arg = $ARGV[$i++];
    if (my $target = $option_target{$arg}) {
        die "Option $arg needs a value\n$usage" if $i >= @ARGV;
        ${$target} = $ARGV[$i++];
    }
    else {
        push @rawfiles, $arg;
    }
}

# Both limits are interpolated into a regex quantifier below, so they have
# to be plain integers forming a valid, non-empty range.
for my $limit ($min_long, $max_long) {
    die "Word lengths must be positive integers (got '$limit')\n$usage"
        unless $limit =~ /\A\d+\z/ && $limit >= 1;
}
die "-min ($min_long) must not be greater than -max ($max_long)\n$usage"
    if $min_long > $max_long;

open(my $dict, '>', $fout) or die "Can't open output file $fout: $!";

if (defined $txt_dir) {
    opendir(my $dir, $txt_dir) or die "Can't access dir $txt_dir: $!";
    # readdir() returns bare entry names, so each one needs the directory
    # prepended before it can be opened. Only plain files are kept.
    push @rawfiles,
        grep { -f $_ }
        map  { "$txt_dir/$_" }
        grep { !$skip_name{$_} } readdir($dir);
    closedir($dir);
}

# A word is a whole run of \w characters whose length lies within the
# limits; the \b anchors keep longer runs from being cut down to a
# $max_long-character prefix.
my $word_re = qr/\b\w{$min_long,$max_long}\b/;

my %seen;    # lowercased word => 1; used only to make the words unique
for my $raw_dict (@rawfiles) {
    next if $skip_name{$raw_dict};
    open(my $raw, '<', $raw_dict)
        or die "Couldn't open input file $raw_dict: $!\n";
    while (my $line = <$raw>) {
        $seen{$_} = 1 for lc($line) =~ /$word_re/g;
    }
    close($raw);
}

# One word per line, sorted as strings.
print {$dict} "$_\n" for sort keys %seen;

# Buffered write errors only show up when the handle is closed.
close($dict) or die "Can't close output file $fout: $!";