`
wang_peng1
  • 浏览: 3900869 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

Counter zip dict sorted

 
阅读更多
def _read_words(filename):
  """Read a text file and split it into whitespace-separated tokens.

  Every newline character is textually replaced by the marker "<eos>" before
  splitting. No padding is inserted around the marker, so a word that is
  immediately followed by a newline stays glued to it (e.g. "am<eos>").

  Args:
    filename: path of the file, opened through tf.gfile.GFile in "r" mode.

  Returns:
    A list of token strings.
  """
  with tf.gfile.GFile(filename, "r") as f:
    # On Python 2 the file yields bytes, which are decoded as UTF-8 first.
    text = f.read() if Py3 else f.read().decode("utf-8")
    marked = text.replace("\n", "<eos>")
    return marked.split()


def _build_vocab(filename):
  """Build a word -> integer id mapping from the tokens of a text file.

  Tokens come from _read_words(filename). Distinct tokens are ordered by
  ascending occurrence count, with ties broken alphabetically, and each one
  receives its position in that ordering as id (0, 1, 2, ...).

  The intermediate values are printed so every step (Counter, sorted, the
  word tuple, the final dict) can be inspected.

  Args:
    filename: path of the text file to read.

  Returns:
    A dict mapping each distinct token to its integer id. A file with no
    tokens yields an empty dict.
  """
  data = _read_words(filename)
  print("data:",data)
  counter = collections.Counter(data)
  print("counter:",counter)
  count_pairs = sorted(counter.items(), key=lambda x: (x[1], x[0]))
  print(" count_pairs:", count_pairs)
  # Pull the words out of the (word, count) pairs directly. Unpacking
  # zip(*count_pairs) into two names raises ValueError when there are no
  # pairs at all, i.e. when the input file contains no tokens.
  words = tuple(word for word, _ in count_pairs)
  print("words:", words)
  word_to_id = {word: index for index, word in enumerate(words)}
  print("  word_to_id:",  word_to_id)
  return word_to_id

 def setUp(self):
    """Prepare the three-line text fixture written to the fake PTB files."""
    # Each line starts with a space and has no trailing space, so the last
    # word of a line sits directly before the newline separator.
    fixture_lines = (
        " hello there i am",
        " rain as day",
        " want some cheesy puffs wu",
    )
    self._string_data = "\n".join(fixture_lines)

  def testPtbRawData(self):
    """Write the fixture as train/valid/test files and load them back."""
    data_dir = tf.test.get_temp_dir()
    for split in ("train", "valid", "test"):
      path = os.path.join(data_dir, "ptb.%s.txt" % split)
      # The same fixture text is used for all three splits.
      with tf.gfile.GFile(path, "w") as sink:
        sink.write(self._string_data)
    # Smoke test: the result is not inspected, the call only has to succeed.
    output = reader.ptb_raw_data(data_dir)

打印结果（运行上面的测试时的输出）：
data Tensor("PTBProducer_1/Reshape:0", shape=(3, ?), dtype=int32)
..data: ['hello', 'there', 'i', 'am<eos>', 'rain', 'as', 'day<eos>', 'want', 'some', 'cheesy', 'puffs', 'wu']
counter: Counter({'there': 1, 'wu': 1, 'as': 1, 'hello': 1, 'puffs': 1, 'am<eos>': 1, 'cheesy': 1, 'day<eos>': 1, 'some': 1, 'i': 1, 'rain': 1, 'want': 1})
 count_pairs: [('am<eos>', 1), ('as', 1), ('cheesy', 1), ('day<eos>', 1), ('hello', 1), ('i', 1), ('puffs', 1), ('rain', 1), ('some', 1), ('there', 1), ('want', 1), ('wu', 1)]
words: ('am<eos>', 'as', 'cheesy', 'day<eos>', 'hello', 'i', 'puffs', 'rain', 'some', 'there', 'want', 'wu')
  word_to_id: {'want': 10, 'there': 9, 'day<eos>': 3, 'wu': 11, 'hello': 4, 'puffs': 6, 'am<eos>': 0, 'cheesy': 2, 'rain': 7, 'some': 8, 'i': 5, 'as': 1}

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics