Source code for jubakit.burst

# coding: utf-8

from __future__ import absolute_import, division, print_function, unicode_literals

import json
import uuid

import jubatus
import jubatus.embedded

from .base import GenericSchema, BaseDataset, BaseService, GenericConfig, Utils
from .loader.array import ArrayLoader, ZipArrayLoader
from .loader.sparse import SparseMatrixLoader
from .loader.chain import ValueMapChainLoader, MergeChainLoader
from .compat import *


def _try_convert_str_to_float(value, label):
  """
  Try to convert input value to float value.
  """
  try:
    return float(value)
  except Exception:
    msg = 'Invalid parameter: {} cannot cast string to float.'.format(label)
    raise ValueError(msg)


[docs]class KeywordSchema(GenericSchema): """ Keyword schema for Burst service. """ KEYWORD = 'k' SCALING = 's' GAMMA = 'g'
[docs] def __init__(self, mapping, fallback=None): super(KeywordSchema, self).__init__(mapping, fallback) self._keyword_key = self._get_unique_mapping( mapping, fallback, self.KEYWORD, 'KEYWORD', True) self._scaling_key = self._get_unique_mapping( mapping, fallback, self.SCALING, 'SCALING', True) self._gamma_key = self._get_unique_mapping( mapping, fallback, self.GAMMA, 'GAMMA', True)
[docs] def transform(self, row): keyword = row.get(self._keyword_key, None) scaling = row.get(self._scaling_key, None) gamma = row.get(self._gamma_key, None) if keyword is None: raise RuntimeError('Row without keyword column cannot be used.') if scaling is not None: scaling = _try_convert_str_to_float(scaling, 'SCALING') if scaling <= 1: raise ValueError('Scaling parameter must be greater than 1.0.') if gamma is not None: gamma = _try_convert_str_to_float(gamma, 'GAMMA') if gamma <= 0: raise ValueError('Gamma must be greater than 1.0.') return (keyword, scaling, gamma)
[docs]class DocumentSchema(GenericSchema): """ Document schema for Burst service. """ POSITION = 'p' TEXT = 't'
[docs] def __init__(self, mapping, fallback=None): super(DocumentSchema, self).__init__(mapping, fallback) self._pos_key = self._get_unique_mapping( mapping, fallback, self.POSITION, 'POSITION', True) self._text_key = self._get_unique_mapping( mapping, fallback, self.TEXT, 'TEXT', True)
[docs] def transform(self, row): pos = row.get(self._pos_key, None) if pos is None: raise RuntimeError('Row without position column cannot be used.') pos = _try_convert_str_to_float(pos, 'POSITION') text = row.get(self._text_key, None) if text is None: text = '' return (pos, text)
[docs]class KeywordDataset(BaseDataset): """ Keyword dataset for Burst service. """ def _predict(cls, row): return KeywordSchema.predict(row, False)
[docs]class DocumentDataset(BaseDataset): """ Document dataset for Burst service. """ def _predict(cls, row): return DocumentSchema.predict(row, False)
[docs]class Burst(BaseService): """ Burst service. """ DEFAULT_SCALING = 1.1 DEFAULT_GAMMA = 0.1
[docs] @classmethod def name(cls): return 'burst'
@classmethod def _client_class(cls): return jubatus.burst.client.Burst @classmethod def _embedded_class(cls): return jubatus.embedded.Burst
[docs] def add_keyword(self, keyword_dataset): """ Registers the keyword for burst detection. """ cli = self._client() for idx, (keyword, scaling, gamma) in keyword_dataset: if scaling is None: scaling = Burst.DEFAULT_SCALING if gamma is None: gamma = Burst.DEFAULT_GAMMA result = cli.add_keyword( jubatus.burst.types.KeywordWithParams(keyword, scaling, gamma)) yield (idx, result)
[docs] def add_documents(self, document_dataset): """ Register the document for burst detection. """ cli = self._client() for (idx, (pos, text)) in document_dataset: if pos is None: raise RuntimeError('Document dataset without position ' + 'column cannot be used.') result = cli.add_documents([jubatus.burst.types.Document(pos, text)]) yield (idx, result)
[docs] def get_result(self, keyword): """ Returns the burst detection result of the current window for pre-registered keyword keyword. """ print('get_result') keyword = str(keyword) cli = self._client() return cli.get_result(keyword)
[docs] def get_result_at(self, keyword, pos): """ Returns the burst detection result at the specified position for pre-registered keyword. """ pos = _try_convert_str_to_float(pos, 'position') keyword = str(keyword) cli = self._client() return cli.get_result_at(keyword, pos)
[docs] def get_all_bursted_results(self): """ Returns the burst detection result of the current window for all pre-registered keywords. """ cli = self._client() return cli.get_all_bursted_results()
[docs] def get_all_bursted_results_at(self, pos): """ Returns the burst detection result at the specified position for all pre-registered keywords. """ pos = _try_convert_str_to_float(pos, 'position') cli = self._client() return cli.get_all_bursted_results_at(float(pos))
[docs] def get_all_keywords(self): """ Returns the list of keywords registered for burst detection. """ cli = self._client() return cli.get_all_keywords()
[docs] def remove_keyword(self, keyword): """ Removes the keyword from burst detection. """ keyword = str(keyword) cli = self._client() return cli.remove_keyword(keyword)
[docs] def remove_all_keywords(self): """ Removes all the keywords from burst detection. """ cli = self._client() return cli.remove_all_keywords()
[docs]class Config(GenericConfig): """ Configurations to run Burst service. """
[docs] def __init__(self, method=None, parameter=None, converter=None): super(Config, self).__init__(method, parameter, converter) if 'converter' in self: del self['converter']
[docs] @classmethod def methods(cls): return ['burst']
@classmethod def _default_method(cls): return 'burst' @classmethod def _default(cls, cfg): cfg.clear() method = cls._default_method() parameter = cls._default_parameter(method) if method is not None: cfg['method'] = method if parameter is not None: cfg['parameter'] = parameter @classmethod def _default_parameter(cls, method): if method != 'burst': raise RuntimeError('unknown method: {0}'.format(method)) return { "window_batch_size": 5, "batch_interval": 10, "max_reuse_batch_num": 5, "costcut_threshold": -1, "result_window_rotate_size": 5 }