datum/ComputationalLinguisticsDatum.js

"use strict";

var LanguageDatum = require("./LanguageDatum").LanguageDatum;

/**
 * @class The ComputationalLinguisticsDatum represents some text and statistics about that text
 *
 * @name  ComputationalLinguisticsDatum
 * @extends LanguageDatum
 * @constructs
 *
 * @param {Object} options Passed through unchanged to the LanguageDatum constructor.
 */
var ComputationalLinguisticsDatum = function ComputationalLinguisticsDatum(options) {
  // Only label the instance when no type has been set yet, so a value
  // assigned before this point is kept.
  if (!this._fieldDBtype) {
    this._fieldDBtype = "ComputationalLinguisticsDatum";
  }
  this.debug("Constructing ComputationalLinguisticsDatum: ", options);
  LanguageDatum.apply(this, [options]);
};


/**
 * Counts how often each item occurs in a list.
 *
 * Items are used as object keys, so two items are considered the same when
 * their property-key (string) forms are equal.
 *
 * @param {Array} items The items to count. A missing (null/undefined) list
 *                      is treated as an empty list.
 * @return {Object} An object with three members:
 *                  tokens - the total number of items,
 *                  types - the number of distinct items,
 *                  frequencies - a plain object mapping each item to its count.
 */
ComputationalLinguisticsDatum.naiveFrequency = function(items) {
  var frequencies = {};
  var tokens = 0;
  var types = 0;
  var hasOwn = Object.prototype.hasOwnProperty;

  (items || []).forEach(function(item) {
    tokens++;

    // Look at own properties only: a plain truthiness test would also see
    // inherited members such as "constructor" or "toString" and corrupt
    // both the type count and the stored frequency.
    if (hasOwn.call(frequencies, item)) {
      frequencies[item]++;
      return;
    }

    types++;
    // defineProperty (rather than assignment) so that a key like "__proto__"
    // becomes an ordinary own entry instead of hitting the prototype setter.
    Object.defineProperty(frequencies, item, {
      value: 1,
      writable: true,
      enumerable: true,
      configurable: true
    });
  });

  return {
    tokens: tokens,
    types: types,
    frequencies: frequencies
  };
};

ComputationalLinguisticsDatum.prototype = Object.create(LanguageDatum.prototype, /** @lends ComputationalLinguisticsDatum.prototype */ {
  constructor: {
    value: ComputationalLinguisticsDatum
  },

  /**
   * Computes character and word frequency statistics for this datum's text
   * (the utterance, or the orthography when there is no utterance) and
   * stores them in this.stats. Bigram slots are created but left empty.
   *
   * @param {Object} options Not used; a warning is logged if it is provided.
   * @return {ComputationalLinguisticsDatum} this, to allow chaining.
   */
  extractStats: {
    value: function(options) {
      if (options) {
        this.warn("Options aren't used", options);
      }

      // Fall back to an empty text so a datum with neither field produces
      // zero counts instead of throwing on undefined.
      var text = this.utterance || this.orthography || "";

      // Keep surrogate pairs together so a character outside the Basic
      // Multilingual Plane is counted once rather than as two half characters.
      var characterList = text.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\s\S]/g) || [];

      // Trim before splitting: otherwise leading/trailing whitespace (or an
      // empty text) would contribute empty strings that get counted as words.
      var trimmedText = text.trim();
      var wordList = trimmedText ? trimmedText.split(/\s+/) : [];

      var characters = ComputationalLinguisticsDatum.naiveFrequency(characterList);
      var words = ComputationalLinguisticsDatum.naiveFrequency(wordList);

      this.stats = {
        characters: {
          unigrams: characters.frequencies,
          bigrams: {}
        },
        words: {
          unigrams: words.frequencies,
          bigrams: {}
        },
        tokens: {
          characters: {
            unigrams: characters.tokens,
            bigrams: {}
          },
          words: {
            unigrams: words.tokens,
            bigrams: {}
          }
        },
        types: {
          characters: characters.types,
          words: words.types
        }
      };
      return this;
    }
  },

  /**
   * The most recently stored statistics object, kept on this._stats.
   * Undefined until a value has been assigned (extractStats assigns one).
   */
  stats: {
    get: function() {
      return this._stats;
    },
    set: function(value) {
      this._stats = value;
    }
  }

});
exports.ComputationalLinguisticsDatum = ComputationalLinguisticsDatum;