To split a string on any run of non-word characters, i.e. anything that is not A-Z, a-z, 0-9, or underscore:
var words=str.split(/\W+/); // assumes str neither begins nor ends with non-word characters
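For illustration (the sample strings are my own), note that a leading or trailing run of non-word characters shows up as an empty string in the result:

'Fred, Barney and Wilma'.split(/\W+/); // -> ["Fred", "Barney", "and", "Wilma"]
'  ...with leading junk'.split(/\W+/); // -> ["", "with", "leading", "junk"]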
Or, assuming your target language is English, you can extract all semantically useful values from a string (i.e. "tokenizing" a string) using:
var str='Here\'s a (good, bad, indifferent, ...) '+
        'example sentence to be used in this test '+
        'of English language "token-extraction".',
    punct='\\['+ '\\!'+ '"'+    '\\#'+ '\\$'+   // since JavaScript does not
          '\\%'+ '\\&'+ '\\\''+ '\\('+ '\\)'+   // support POSIX character
          '\\*'+ '\\+'+ '\\,'+  '\\\\'+ '\\-'+  // classes, we'll need our
          '\\.'+ '\\/'+ '\\:'+  '\\;'+ '\\<'+   // own version of [:punct:]
          '\\='+ '\\>'+ '\\?'+  '\\@'+ '\\['+
          '\\]'+ '\\^'+ '\\_'+  '\\`'+ '\\{'+
          '\\|'+ '\\}'+ '\\~'+  '\\]',
    re=new RegExp(              // tokenizer
        '\\s*'+                 // discard possible leading whitespace
        '('+                    // start capture group
        '\\.{3}'+               // ellipsis (must appear before punct)
        '|'+                    // alternator
        '\\w+\\-\\w+'+          // hyphenated words (must appear before punct)
        '|'+                    // alternator
        '\\w+\'(?:\\w+)?'+      // words with apostrophes, e.g. "Here's" (must appear before punct)
        '|'+                    // alternator
        '\\w+'+                 // other words
        '|'+                    // alternator
        '['+punct+']'+          // punct
        ')'                     // end capture group
    );
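// aside (mine, not part of the original tokenizer): the alternatives are tried
// left to right, which is why the ellipsis, hyphen and apostrophe branches must
// precede the single-character punct branch, e.g.:
//   /(\.{3}|[.,])/.exec('Wait...')[1]  // -> "..."
//   /([.,]|\.{3})/.exec('Wait...')[1]  // -> "."  (the ellipsis would be split into single dots)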
// grep(ary[,filt]) - filters an array
// note: could use jQuery.grep() instead
// @param {Array} ary array of members to filter
// @param {Function} filt function to test truthiness of member,
// if omitted, "function(member){ if(member) return member; }" is assumed
// @returns {Array} all members of ary where result of filter is truthy
function grep(ary,filt) {
  var result=[];
  for(var i=0,len=ary.length;i<len;i++) {
    var member=ary[i]||'';
    if(filt && typeof filt === 'function' ? filt(member) : member) {
      result.push(member);
    }
  }
  return result;
}
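// quick sanity check of grep() on its own (the sample data is mine):
//   grep(['a', '', null, 'b'])                        // -> ["a", "b"]
//   grep([1, 2, 3, 4], function(n){ return n % 2; })  // -> [1, 3]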
var tokens=grep( str.split(re) ); // note: filter function omitted
                                  // since all we need to test
                                  // for is truthiness
which produces:
tokens=[
  "Here's",
  'a',
  '(',
  'good',
  ',',
  'bad',
  ',',
  'indifferent',
  ',',
  '...',
  ')',
  'example',
  'sentence',
  'to',
  'be',
  'used',
  'in',
  'this',
  'test',
  'of',
  'English',
  'language',
  '"',
  'token-extraction',
  '"',
  '.'
]
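For reference, String.prototype.split splices the text of any capturing group into its result, so str.split(re) actually returns the captured tokens interleaved with the (empty) slices of the original string between matches; that is why the result is passed through grep(). A minimal illustration on a toy string and pattern of my own:

'a, b'.split(/\s*(\w+|,)/); // -> ["", "a", "", ",", "", "b", ""]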
EDIT
Also available as a GitHub Gist.