
Contents
実行環境
••
•
使うファイル群と作業用 DB
ファイル群は旧ブログ(Seesaa)の全
DB
↓
# create table kenpg_seesaa_list as select ary[1] :: date as ymd, r[1] as url, r[2] as title, coalesce(substring(str, ': ([^ ]+) :'), (select string_agg(r[1], ', ') from regexp_matches(str, '>([^<]+)', 'g') as r) ) as tags from html_raw_tmp, regexp_matches(line,'(\d{4}/\d{2}/\d{2})(.+?)(li|div)>', 'g') as ary, cast(ary[2] as text) as str, regexp_matches(str, ']*>([^<]+)') as r where r is not null; -- check result # select * from kenpg_seesaa_list order by 1 desc; +------------+------------------------------------------+-------------------------------------------------------------+----------------------------------+ | ymd | url | title | tag | +------------+------------------------------------------+-------------------------------------------------------------+----------------------------------+ | 2015-05-23 | kenpg2.seesaa.net/article/419462759.html | http://kenpg.bitbucket.org/ に移転しました | info | | 2015-04-19 | kenpg2.seesaa.net/article/417500233.html | 移転の予告(来月から) | info | | 2015-04-18 | kenpg2.seesaa.net/article/417234379.html | pywebsocket & PostgreSQL の Web クライアント(暫定) | WebSocket, Python, PostgreSQL | | 2015-04-17 | kenpg2.seesaa.net/article/417146780.html | pywebsocket でブラウザ ⇒ サーバに処理中止を指示(暫定) | WebSocket, Python | | 2015-04-16 | kenpg2.seesaa.net/article/417129799.html | ポータブルな Firefox & Python で WebSocket 通信テスト | Python, Firefox, WebSocket | ... (693 rows)

重複や
# alter table kenpg_seesaa_list add primary key(ymd); create unique index on kenpg_seesaa_list (url); alter table kenpg_seesaa_list alter url set not null; alter table kenpg_seesaa_list alter title set not null; alter table kenpg_seesaa_list alter tags set not null;
テーブル各行と、これから取り込む
# alter table kenpg_seesaa_list add html_file text; alter table kenpg_seesaa_list add html text;
↓ テーブル定義を表示して、ここまでの準備を確認。
# \d kenpg_seesaa_list Table "public.kenpg_seesaa_list" +-----------+------+-----------+ | Column | Type | Modifiers | +-----------+------+-----------+ | ymd | date | not null | | url | text | not null | | title | text | not null | | tags | text | not null | | html_file | text | | | html | text | | +-----------+------+-----------+ Indexes: "kenpg_seesaa_list_pkey" PRIMARY KEY, btree (ymd) "kenpg_seesaa_list_url_idx" UNIQUE, btree (url)

Cygwin シェルから「Windows のシンボリックリンク」を作成
今回、過去記事で何回かやったのと同様、PostgreSQL少し注意なのは、DB
今までは普通に
↓
# show data_directory \gset \\ \setenv pgdata :data_directory # \! cmd /c "mklink /d $(cygpath $pgdata -w)\r_tmp r:\tmp" symbolic link created for D:\works\PostgreSQL\9.5.3\data\r_tmp <<===>> r:\tmp
↓ フォルダ内の一覧を
# select pg_ls_dir('r_tmp'); +-----------+ | pg_ls_dir | +-----------+ | html | +-----------+ (1 row) # select pg_ls_dir('r_tmp/html'); +---------------------------+ | pg_ls_dir | +---------------------------+ | 2013-04-24_357353114.html | | 2013-04-25_356942010.html | | 2013-04-26_357352545.html | | 2013-04-27_357232034.html | | 2013-04-28_357678850.html | ... (693 rows)

DB のテーブル内に HTML ファイル名を生成し、一括読み込み
前々項で書いたとおり、各↓ いきなり
# select ymd, url, to_char(ymd, 'yyyy-mm-dd') || '_' || substring(url, '\d+\.html$') from kenpg_seesaa_list; +------------+------------------------------------------+---------------------------+ | ymd | url | ?column? | +------------+------------------------------------------+---------------------------+ | 2014-09-16 | kenpg.seesaa.net/article/405510842.html | 2014-09-16_405510842.html | | 2014-09-14 | kenpg.seesaa.net/article/405425703.html | 2014-09-14_405425703.html | | 2014-09-13 | kenpg.seesaa.net/article/405374546.html | 2014-09-13_405374546.html | | 2014-09-12 | kenpg.seesaa.net/article/405030672.html | 2014-09-12_405030672.html | | 2014-09-11 | kenpg.seesaa.net/article/404550720.html | 2014-09-11_404550720.html | ... (693 rows) # update kenpg_seesaa_list set html_file = to_char(ymd, 'yyyy-mm-dd') || '_' || substring(url, '\d+\.html$'); UPDATE 693
ファイル名が
↓ 確認例。ファイルコンテンツの長さが小さい順に出力します。想定外にファイル名が違って不存在とか、文字コードが
# select html_file, length(pg_read_file('r_tmp/html/' || html_file)) from kenpg_seesaa_list order by 2 nulls first; +---------------------------+--------+ | html_file | length | +---------------------------+--------+ | 2014-04-13_394007556.html | 14612 | | 2013-05-02_358175897.html | 14671 | | 2013-05-03_357712735.html | 14678 | | 2014-06-16_399586332.html | 14688 | | 2013-05-08_358520995.html | 14703 | ... (693 rows)
↓ 問題なさそうなので
# update kenpg_seesaa_list set html = pg_read_file('r_tmp/html/' || html_file); UPDATE 693 # select html_file, length(html) from kenpg_seesaa_list order by 2 nulls first; +---------------------------+--------+ | html_file | length | +---------------------------+--------+ | 2014-04-13_394007556.html | 14612 | | 2013-05-02_358175897.html | 14671 | | 2013-05-03_357712735.html | 14678 | | 2014-06-16_399586332.html | 14688 | | 2013-05-08_358520995.html | 14703 | ... (693 rows)

↓ ランダムに抽出して
# select html from kenpg_seesaa_list order by random() limit 1; +--------------------------------------------------------------------------- | +--------------------------------------------------------------------------- | | | | | Moran's I 統計量を算出するストアド関数 : 研究に使うポスグレ ... # select title, substring(html, '([^<]+) ') from kenpg_seesaa_list order by random() limit 1; +-----------------------------------+--------------------------------------------------------+ | title | substring | +-----------------------------------+--------------------------------------------------------+ | pgAdminⅢ で COPY … STDIN の代替策 | pgAdminⅢ で COPY … STDIN の代替策 : 研究に使うポスグレ | +-----------------------------------+--------------------------------------------------------+ (1 row)


もうシンボリックリンクは不要なので
# \! cmd /c "rmdir $(cygpath $pgdata -w)\r_tmp r:\tmp" The directory is not empty.
最後に念のため、投入した
# alter table kenpg_seesaa_list alter html_file set not null; alter table kenpg_seesaa_list alter html set not null; # select pg_size_pretty( pg_table_size( to_regclass('kenpg_seesaa_list') ) ); +----------------+ | pg_size_pretty | +----------------+ | 7464 kB | +----------------+ (1 row)
実行コマンドまとめ(全て psql 上)
記事が長くなったのでまとめ。確認部分やテーブル制約は省いてます。2# create table kenpg_seesaa_list as select ary[1] :: date as ymd, r[1] as url, r[2] as title, coalesce(substring(str, ': ([^ ]+) :'), (select string_agg(r[1], ', ') from regexp_matches(str, '>([^<]+)', 'g') as r) ) as tags from html_raw_tmp, regexp_matches(line,'(\d{4}/\d{2}/\d{2})(.+?)(li|div)>', 'g') as ary, cast(ary[2] as text) as str, regexp_matches(str, ']*>([^<]+)') as r where r is not null; # alter table kenpg_seesaa_list add html_file text; # alter table kenpg_seesaa_list add html text; # show data_directory \gset \\ \setenv pgdata :data_directory # \! cmd /c "mklink /d $(cygpath $pgdata -w)\r_tmp r:\tmp" # update kenpg_seesaa_list set html_file = to_char(ymd, 'yyyy-mm-dd') || '_' || substring(url, '\d+\.html$'); # update kenpg_seesaa_list set html = pg_read_file('r_tmp/html/' || html_file); # \! cmd /c "rmdir $(cygpath $pgdata -w)\r_tmp r:\tmp"
