DarkMatter in Cyberspace
  • Home
  • Categories
  • Tags
  • Archives

Import Data for Newfairs.com


Prerequisites

First install MongoDB, node.js and csvkit.

For node.js, download node binary package (node-v0.10.33-linux-x64.tar.gz in my case) from its website, extract and add binary folder into $PATH. For example, add the following line into ~/.zshenv:

PATH=$HOME/apps/node-v0.10.33-linux-x64/bin:$PATH

For MongoDB, download its binary package (mongodb-linux-x86_64-2.6.5.tgz in my case) from its website, extract and add binary folder into $PATH:

PATH=$HOME/apps/mongodb-linux-x86_64-2.6.5/bin:$PATH

Install csvkit with sudo pip install csvkit.

Preparation

  1. Convert Excel (xls/xlsx) file to csv file: in2csv data.xls > data.csv;

  2. (Optional) Convert file encoding: iconv -f gbk -t utf8 data20150218.csv > input.csv;

  3. Examine data: head input.csv | csvjson -i 4;

  4. Start MongoDB server for importing data: mongod;

Import Data

In case you want to overwrite old data, you need backup old collection: start a mongodb client, make a copy of the old collection (the original collection is "fairs"):

$ mongo
> db.fairs.renameCollection('fairsBak')

P.S. You can make a copy of a collection with db.fairs.copyTo('fairsBak').

Now import new data to collection "fairs":

./importdata

Under the hood

importdata:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
#!/bin/bash

# convert the csv file encoding with:
# iconv -f gbk -t utf8 xxx.csv > input.csv
INP='input.csv'
TargetDB='test'
TargetCol='fairs'

RAW='rawdata.json'
RES='result.json'
if [[ ! -f $INP ]]; then
  echo File input.csv not exists!
  exit 1
fi
rm -rf $RAW $RES
csvjson $INP > $RAW
node checkTransform
mongoimport -d $TargetDB -c $TargetCol --type json --file $RES --jsonArray
rm -rf $RAW $RES

checkTransform.js:

var data = require('./rawdata.json');
var fs = require('fs');
var result = []

data.forEach(function(elem) {
  var aFair = JSON.parse( JSON.stringify(elem) );
  aFair.indexStr = {};
  aFair.indexStr['name'] = (elem.chnName + ' ' + elem.engName).trim();
  aFair.indexStr['sponsor'] = elem.sponsor;
  aFair.indexStr['undertaker'] = elem.undertaker;
  aFair.indexStr['category'] = elem.category;
  aFair.indexStr['simpleSearch'] = elem.chnName + ' ' + elem.engName + ' ' + elem.position + ' ' + elem.time + ' ' + elem.category;

  if (elem.sponsor) {
    aFair['sponsor'] = [];
    elem.sponsor.split('|').forEach(function(spr) {
      var aSponsor = {};
      var sps = spr.split('$');
      aSponsor['name'] = sps[0];
      aSponsor['tel'] = sps[1];
      aSponsor['fax'] = sps[2];
      aSponsor['email'] = sps[3];
      aSponsor['website'] = sps[4];
      aFair.sponsor.push(aSponsor);
    });
  }

  if (elem.undertaker) {
    aFair['undertaker'] = [];
    elem.undertaker.split('|').forEach(function(udt) {
      var aUndertaker = {};
      var uds = udt.split('$');
      aUndertaker['name'] = uds[0];
      aUndertaker['tel'] = uds[1];
      aUndertaker['fax'] = uds[2];
      aUndertaker['email'] = uds[3];
      aUndertaker['website'] = uds[4];
      aFair.undertaker.push(aUndertaker);
    });
  }

  if (elem.category) {
    aFair['category'] = [];
    elem.category.split('|').forEach(function(catStr) {
      var aCat = {};
      var cats = catStr.split('$');
      aCat['major'] = cats[0] === '?' ? '' : cats[0];
      aCat['minor'] = cats.slice(1);
      aFair.category.push(aCat);
    });
  }

  result.push(aFair);
});

fs.writeFile('result.json', JSON.stringify(result));


Published

Feb 20, 2015

Last Updated

Feb 20, 2015

Category

Tech

Tags

  • csv 4
  • json 7
  • mongodb 24

Contact

  • Powered by Pelican. Theme: Elegant by Talha Mansoor